From 0a7de4a8f898c480ffafe024c4a0a8b8819597f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:02 +0000 Subject: net: rps: remove kfree_rcu_mightsleep() use Add an rcu_head to sd_flow_limit and rps_sock_flow_table structs to use the more conventional and predictable k[v]free_rcu(). Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f27..507f4aa5d39b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) -- cgit v1.2.3 From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:41 +0200 Subject: udp_tunnel: create a fastpath GRO lookup. Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such reference is not NULL, UDP tunnel GRO lookup just need to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Restrict the optimization to kernel sockets only: it covers all the relevant use-cases, and user-space owned sockets could be disconnected and rebound after setup_udp_tunnel_sock(), breaking the uniqueness assumption Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointer share the same cacheline as the `udp_table` field. Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 12 ++++++++++++ net/ipv4/udp.c | 13 ++++++++++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_core.c | 13 +++++++++++++ net/ipv6/udp.c | 2 ++ net/ipv6/udp_offload.c | 5 +++++ 9 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec9..895240177f4f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f..6373e3f17da8 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..a772510b2aa5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..1bb2b852e90e 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -191,6 +191,18 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2742cc7602bb..04cd3353f1d4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2897,8 +2897,10 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } @@ -3810,6 +3812,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be3..69fc4eae8250 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,38 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -635,8 +667,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672d..3d1214e6df0e 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99a..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, -- cgit v1.2.3 From 5d7f5b2f6b935517ee5fd8058dc32342a5cba3e1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:42 +0200 Subject: udp_tunnel: use static call for GRO hooks when possible It's quite common to have a single UDP tunnel type active in the whole system. In such a case we can replace the indirect call for the UDP tunnel GRO callback with a static call. Add the related accounting in the control path and switch to static call when possible. To keep the code simple use a static array for the registered tunnel types, and size such array based on the kernel config. Note that there are valid kernel configurations leading to UDP_MAX_TUNNEL_TYPES == 0 even with IS_ENABLED(CONFIG_NET_UDP_TUNNEL), Explicitly skip the accounting in such a case, to avoid compile warning when accessing "udp_tunnel_gro_types". Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/53d156cdfddcc9678449e873cc83e68fa1582653.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 4 ++ net/ipv4/udp_offload.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 1bb2b852e90e..288f06f23a80 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -193,13 +193,16 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); #else static inline void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} #endif static inline void udp_tunnel_cleanup_gro(struct sock *sk) { + udp_tunnel_update_gro_rcv(sk, false); udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); } @@ -212,6 +215,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif + udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 69fc4eae8250..787b2947d3a0 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -15,6 +15,38 @@ #include #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2 + \ + IS_ENABLED(CONFIG_XFRM) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; static DEFINE_SPINLOCK(udp_tunnel_gro_lock); void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) @@ -43,6 +75,105 @@ void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) spin_unlock(&udp_tunnel_gro_lock); } EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + + /* Check if the static call is permanently disabled. */ + if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES) + goto out; + + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1; + } else { + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. + */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* Avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + #endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, @@ -654,7 +785,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -804,5 +935,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } -- cgit v1.2.3 From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 7 Apr 2025 21:01:02 +0200 Subject: net: Drop unused @sk of __skb_try_recv_from_queue() __skb_try_recv_from_queue() deals with a queue, @sk is not used since commit e427cad6eee4 ("net: datagram: drop 'destructor' argument from several helpers"). Remove sk from function parameters, adapt callers. No functional change intended. Signed-off-by: Michal Luczaj Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 +-- net/core/datagram.c | 5 ++--- net/ipv4/udp.c | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..f1381aff0f89 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece..f0634f0cb834 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -163,8 +163,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +260,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 04cd3353f1d4..8867fe687888 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1942,8 +1942,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1964,8 +1964,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); -- cgit v1.2.3 From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:48 -0700 Subject: net: avoid potential race between netdev_get_by_index_lock() and netns switch netdev_get_by_index_lock() performs following steps: rcu_lock(); dev = lookup(netns, ifindex); dev_get(dev); rcu_unlock(); [... lock & validate the dev ...] return dev Validation right now only checks if the device is registered but since the lookup is netns-aware we must also protect against the device switching netns right after we dropped the RCU lock. Otherwise the caller in netns1 may get a pointer to a device which has just switched to netns2. We can't hold the lock for the entire netns change process (because of the NETDEV_UNREGISTER notifier), and there's no existing marking to indicate that the netns is unlisted because of netns move, so add one. AFAIU none of the existing netdev_get_by_index_lock() callers can suffer from this problem (NAPI code double checks the netns membership and other callers are either under rtnl_lock or not ns-sensitive), so this patch does not have to be treated as a fix. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- net/core/dev.c | 25 ++++++++++++++++++------- net/core/dev.h | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817b..8e9be80bc167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_reg_state { * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,6 +2360,9 @@ struct net_device { bool dismantle; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, @@ -2521,7 +2525,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/net/core/dev.c b/net/core/dev.c index 4ccc6dc5303e..8c13e5339cc9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -828,7 +828,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (!dev) return NULL; @@ -1039,10 +1039,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ -struct net_device *__netdev_put_lock(struct net_device *dev) +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) { netdev_lock(dev); - if (dev->reg_state > NETREG_REGISTERED) { + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); dev_put(dev); return NULL; @@ -1070,7 +1071,7 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) if (!dev) return NULL; - return __netdev_put_lock(dev); + return __netdev_put_lock(dev, net); } struct net_device * @@ -1090,7 +1091,7 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (dev) return dev; @@ -12157,7 +12158,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); - netdev_unlock_ops(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); @@ -12193,7 +12198,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -12219,7 +12226,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); - netdev_lock_ops(dev); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ diff --git a/net/core/dev.h b/net/core/dev.h index 710abc05ebdb..e7bf21f52fc7 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); -struct net_device *__netdev_put_lock(struct net_device *dev); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); -- cgit v1.2.3 From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:49 -0700 Subject: net: designate XSK pool pointers in queues as "ops protected" Read accesses go via xsk_get_pool_from_qid(), the call coming from the core and gve look safe (other "ops locked" drivers don't support XSK). Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid(). Former is already under the ops lock, latter is not (both coming from the workqueue via xp_clear_dev() and NETDEV_UNREGISTER via xsk_notifier()). Acked-by: Stanislav Fomichev Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 6 +++--- net/xdp/xsk_buff_pool.c | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e9be80bc167..7242fb8a22fc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dce..8cdcd138b33f 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 25a76c5ce0f1..cbf2129e808b 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -266,13 +266,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) -- cgit v1.2.3 From 4ec9031cbeb73a66979560bbb6d355329be762de Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:50 -0700 Subject: netdev: add "ops compat locking" helpers Add helpers to "lock a netdev in a backward-compatible way", which for ops-locked netdevs will mean take the instance lock. For drivers which haven't opted into the ops locking we'll take rtnl_lock. The scoped foreach is dropping and re-taking the lock for each device, even if prev and next are both under rtnl_lock. I hope that's fine since we expect that netdev nl to be mostly supported by modern drivers, and modern drivers should also opt into the instance locking. Note that these helpers are mostly needed for queue related state, because drivers modify queue config in their ops in a non-atomic way. Or differently put, queue changes don't have a clear-cut API like NAPI configuration. Any state that can should just use the instance lock directly, not the "compat" hacks. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 16 +++++++++++++++ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev.h | 15 ++++++++++++++ 3 files changed, 82 insertions(+) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index c316b551df8d..5706835a660c 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -64,6 +64,22 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev) netdev_ops_assert_locked(dev); } +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { diff --git a/net/core/dev.c b/net/core/dev.c index 8c13e5339cc9..b52efa4cec56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1052,6 +1052,20 @@ struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) return dev; } +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace @@ -1074,6 +1088,18 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) return __netdev_put_lock(dev, net); } +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) @@ -1099,6 +1125,31 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, } while (true); } +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) diff --git a/net/core/dev.h b/net/core/dev.h index e7bf21f52fc7..e93f36b7ddf3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -42,6 +42,21 @@ DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ ifindex++) +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else -- cgit v1.2.3 From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:52 -0700 Subject: xdp: double protect netdev->xdp_flags with netdev->lock Protect xdp_features with netdev->lock. This way pure readers no longer have to take rtnl_lock to access the field. This includes calling NETDEV_XDP_FEAT_CHANGE under the lock. Looks like that's fine for bonding, the only "real" listener, it's the same as ethtool feature change. In terms of normal drivers - only GVE need special consideration (other drivers don't use instance lock or don't support XDP). It calls xdp_set_features_flag() helper from gve_init_priv() which in turn is called from gve_reset_recovery() (locked), or prior to netdev registration. So switch to _locked. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Acked-by: Harshitha Ramamurthy Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 1 + drivers/net/ethernet/google/gve/gve_main.c | 2 +- include/linux/netdevice.h | 2 +- include/net/xdp.h | 1 + net/core/lock_debug.c | 2 +- net/core/xdp.c | 12 +++++++++++- 6 files changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 6c2d8945f597..d6357472d3f1 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -354,6 +354,7 @@ For devices with locked ops, currently only the following notifiers are running under the lock: * ``NETDEV_REGISTER`` * ``NETDEV_UP`` +* ``NETDEV_XDP_FEAT_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f9a73c956861..7a249baee316 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2185,7 +2185,7 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) xdp_features = 0; } - xdp_set_features_flag(priv->dev, xdp_features); + xdp_set_features_flag_locked(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7242fb8a22fc..dece2ae396a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2526,7 +2526,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net + * @up, @moving_ns, @nd_net, @xdp_flags * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35d..20e41b5ff319 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,6 +616,7 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index b7f22dc92a6f..598c443ef2f3 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -20,6 +20,7 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, switch (cmd) { case NETDEV_REGISTER: case NETDEV_UP: + case NETDEV_XDP_FEAT_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: @@ -58,7 +59,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; diff --git a/net/core/xdp.c b/net/core/xdp.c index f86eedad586a..3cd0db9c9d2d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include /* struct xdp_mem_allocator */ #include @@ -991,17 +992,26 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) -- cgit v1.2.3 From ce7b14947484e6190372f2c3dbfb69aafbc4c0fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:55 -0700 Subject: netdev: depend on netdev->lock for qstats in ops locked drivers We mostly needed rtnl_lock in qstat to make sure the queue count is stable while we work. For "ops locked" drivers the instance lock protects the queue count, so we don't have to take rtnl_lock. For currently ops-locked drivers: netdevsim and bnxt need the protection from netdev going down while we dump, which instance lock provides. gve doesn't care. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 6 ++++++ include/net/netdev_queues.h | 4 +++- net/core/netdev-genl.c | 29 +++++++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 7ae28c5fb835..0ccc7dcf4390 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -356,6 +356,12 @@ Similarly to ``ndos`` the instance lock is only held for select drivers. For "ops locked" drivers all ethtool ops without exceptions should be called under the instance lock. +struct netdev_stat_ops +---------------------- + +"qstat" ops are invoked under the instance lock for "ops locked" drivers, +and under rtnl_lock for all other drivers. + struct net_shaper_ops --------------------- diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 825141d675e5..ea709b59d827 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -85,9 +85,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 8c58261de969..b64c614a00c4 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -795,26 +795,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev && netdev->stat_ops) { - err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, - info, ctx); - } else { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); - err = netdev ? -EOPNOTSUPP : -ENODEV; + return -ENODEV; } - } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); - if (err < 0) - break; + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - rtnl_unlock(); return err; } -- cgit v1.2.3 From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Apr 2025 20:27:42 +0000 Subject: net: remove cpu stall in txq_trans_update() txq_trans_update() currently uses txq->xmit_lock_owner to conditionally update txq->trans_start. For regular devices, txq->xmit_lock_owner is updated from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently causes cpu stalls. Using dev->lltx, which sits in a read-mostly cache-line, and already used in HARD_TX_LOCK() and HARD_TX_UNLOCK() helps cpu prediction. On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads and 30,000 flows gets a 5 % increase in throughput. As explained in commit 95ecba62e2fd ("net: fix races in netdev_tx_sent_queue()/dev_watchdog()") I am planning to no longer update txq->trans_start in the fast path in a followup patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- include/linux/netdevice.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index c9fd34787c99..e78de79a5d78 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -427,7 +427,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, if (netif_tx_queue_stopped(netif_txq)) { /* try recover if stopped by us */ - txq_trans_update(netif_txq); + txq_trans_update(ndev, netif_txq); netif_tx_wake_queue(netif_txq); } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dece2ae396a1..a28a08046615 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } -- cgit v1.2.3 From 04271411121a58d37f47b065bc872f333274bf1f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:04 +0800 Subject: tcp: add TCP_RFC7323_TW_PAWS drop reason Devices in the networking path, such as firewalls, NATs, or routers, which can perform SNAT or DNAT, use addresses from their own limited address pools to masquerade the source address during forwarding, causing PAWS verification to fail more easily. Currently, packet loss statistics for PAWS can only be viewed through MIB, which is a global metric and cannot be precisely obtained through tracing to get the specific 4-tuple of the dropped packet. In the past, we had to use kprobe ret to retrieve relevant skb information from tcp_timewait_state_process(). We add a drop_reason pointer, similar to what previous commit does: commit e34100c2ecbb ("tcp: add a drop_reason pointer to tcp_check_req()") This commit addresses the PAWSESTABREJECTED case and also sets the corresponding drop reason. We use 'pwru' to test. Before this commit: '''' ./pwru 'port 9999' 2025/04/07 13:40:19 Listening for events.. TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_NOT_SPECIFIED) ''' After this commit: ''' ./pwru 'port 9999' 2025/04/07 13:51:34 Listening for events.. TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_TCP_RFC7323_TW_PAWS) ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ include/net/tcp.h | 3 ++- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e4fdc6b54cef..9701d7f936f6 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,7 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ FN(TCP_RFC7323_TSECR) \ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ @@ -283,6 +284,11 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. * Corresponds to LINUX_MIB_TSECRREJECTED. diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef17..5078ad868fee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -427,7 +427,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8cce0d5489da..d5b5c32115d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2417,7 +2417,8 @@ do_time_wait: goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fb9349be36b8..27511bf58c0f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th, u32 *tw_isn) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); @@ -245,8 +246,10 @@ kill: return TCP_TW_SYN; } - if (paws_reject) + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b03c223eda4f..7dcb33f879ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,7 +1970,8 @@ do_time_wait: goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { -- cgit v1.2.3 From c449d5f3a3d70b6223af8df2cadca3ca6eacb613 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:05 +0800 Subject: tcp: add LINUX_MIB_PAWS_TW_REJECTED counter When TCP is in TIME_WAIT state, PAWS verification uses LINUX_PAWSESTABREJECTED, which is ambiguous and cannot be distinguished from other PAWS verification processes. We added a new counter, like the existing PAWS_OLD_ACK one. Also we update the doc with previously missing PAWS_OLD_ACK. usage: ''' nstat -az | grep PAWSTimewait TcpExtPAWSTimewait 1 0.0 ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-3-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/snmp.rst | 2 ++ include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index bc96efc92cf5..bd44b3eebbef 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -37,6 +37,8 @@ unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED unsigned_long LINUX_MIB_TSECR_REJECTED +unsigned_long LINUX_MIB_PAWS_OLD_ACK +unsigned_long LINUX_MIB_PAWS_TW_REJECTED unsigned_long LINUX_MIB_DELAYEDACKLOST unsigned_long LINUX_MIB_LISTENOVERFLOWS unsigned_long LINUX_MIB_LISTENDROPS diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 9701d7f936f6..bea77934a235 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -287,6 +287,7 @@ enum skb_drop_reason { /** * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. */ SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index ec47f9b68a1b..1d234d7e1892 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -188,6 +188,7 @@ enum LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ + LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 10cbeb76c274..ea2f01584379 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -191,6 +191,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), + SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 27511bf58c0f..43d7852ce07e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -248,7 +248,7 @@ kill: if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { -- cgit v1.2.3 From b1e904999542ad6764eafa54545f1c55776006d1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:01 -0700 Subject: net: pass const to msg_data_left() The msg_data_left() function doesn't modify the struct msghdr parameter, so mark it as const. This allows the function to be used with const references, improving type safety and making the API more flexible. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-1-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index c3322eb3d686..3b262487ec06 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } -- cgit v1.2.3 From 0f08335ade71273f89d19412268b48b55f3e3726 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:02 -0700 Subject: trace: tcp: Add tracepoint for tcp_sendmsg_locked() Add a tracepoint to monitor TCP send operations, enabling detailed visibility into TCP message transmission. Create a new tracepoint within the tcp_sendmsg_locked function, capturing traditional fields along with size_goal, which indicates the optimal data size for a single TCP segment. Additionally, a reference to the struct sock sk is passed, allowing direct access for BPF programs. The implementation is largely based on David's patch[1] and suggestions. Link: https://lore.kernel.org/all/70168c8f-bf52-4279-b4c4-be64527aa1ac@kernel.org/ [1] Signed-off-by: Breno Leitao Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-2-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 24 ++++++++++++++++++++++++ kernel/bpf/btf.c | 1 + net/ipv4/tcp.c | 2 ++ 3 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1a40c41ff8c3..75d3d53a3832 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -259,6 +259,30 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); +TRACE_EVENT(tcp_sendmsg_locked, + TP_PROTO(const struct sock *sk, const struct msghdr *msg, + const struct sk_buff *skb, int size_goal), + + TP_ARGS(sk, msg, skb, size_goal), + + TP_STRUCT__entry( + __field(const void *, skb_addr) + __field(int, skb_len) + __field(int, msg_left) + __field(int, size_goal) + ), + + TP_fast_assign( + __entry->skb_addr = skb; + __entry->skb_len = skb ? skb->len : 0; + __entry->msg_left = msg_data_left(msg); + __entry->size_goal = size_goal; + ), + + TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d", + __entry->skb_addr, __entry->skb_len, __entry->msg_left, + __entry->size_goal)); + DECLARE_TRACE(tcp_cwnd_reduction_tp, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfa..24a26b4bb0b8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6541,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6edc441b3702..e0e96f8fd47c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1160,6 +1160,8 @@ restart: if (skb) copy = size_goal - skb->len; + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; -- cgit v1.2.3 From 2a63dd0edf388802074f1d4d6b588a3b4c380688 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:45 -0700 Subject: net: Retire DCCP socket. DCCP was orphaned in 2021 by commit 054c4610bd05 ("MAINTAINERS: dccp: move Gerrit Renker to CREDITS"), which noted that the last maintainer had been inactive for five years. In recent years, it has become a playground for syzbot, and most changes to DCCP have been odd bug fixes triggered by syzbot. Apart from that, the only changes have been driven by treewide or networking API updates or adjustments related to TCP. Thus, in 2023, we announced we would remove DCCP in 2025 via commit b144fcaf46d4 ("dccp: Print deprecation notice."). Since then, only one individual has contacted the netdev mailing list. [0] There is ongoing research for Multipath DCCP. The repository is hosted on GitHub [1], and development is not taking place through the upstream community. While the repository is published under the GPLv2 license, the scheduling part remains proprietary, with a LICENSE file [2] stating: "This is not Open Source software." The researcher mentioned a plan to address the licensing issue, upstream the patches, and step up as a maintainer, but there has been no further communication since then. Maintaining DCCP for a decade without any real users has become a burden. Therefore, it's time to remove it. Removing DCCP will also provide significant benefits to TCP. It allows us to freely reorganize the layout of struct inet_connection_sock, which is currently shared with DCCP, and optimize it to reduce the number of cachelines accessed in the TCP fast path. Note that we keep DCCP netfilter modules as requested. [3] Link: https://lore.kernel.org/netdev/20230710182253.81446-1-kuniyu@amazon.com/T/#u #[0] Link: https://github.com/telekom/mp-dccp #[1] Link: https://github.com/telekom/mp-dccp/blob/mpdccp_v03_k5.10/net/dccp/non_gpl_scheduler/LICENSE #[2] Link: https://lore.kernel.org/netdev/Z_VQ0KlCRkqYWXa-@calendula/ #[3] Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore (LSM and SELinux) Acked-by: Casey Schaufler Link: https://patch.msgid.link/20250410023921.11307-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/bug-hunting.rst | 2 +- Documentation/networking/dccp.rst | 219 --- Documentation/networking/index.rst | 1 - Documentation/networking/ip-sysctl.rst | 4 +- .../translations/zh_CN/admin-guide/bug-hunting.rst | 2 +- .../translations/zh_TW/admin-guide/bug-hunting.rst | 2 +- MAINTAINERS | 9 - arch/m68k/configs/amiga_defconfig | 2 - arch/m68k/configs/apollo_defconfig | 2 - arch/m68k/configs/atari_defconfig | 2 - arch/m68k/configs/bvme6000_defconfig | 2 - arch/m68k/configs/hp300_defconfig | 2 - arch/m68k/configs/mac_defconfig | 2 - arch/m68k/configs/multi_defconfig | 2 - arch/m68k/configs/mvme147_defconfig | 2 - arch/m68k/configs/mvme16x_defconfig | 2 - arch/m68k/configs/q40_defconfig | 2 - arch/m68k/configs/sun3_defconfig | 2 - arch/m68k/configs/sun3x_defconfig | 2 - arch/mips/configs/bigsur_defconfig | 1 - arch/mips/configs/gpr_defconfig | 1 - arch/mips/configs/mtx1_defconfig | 1 - arch/powerpc/configs/pmac32_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - include/linux/dccp.h | 289 ---- include/linux/tfrc.h | 51 - include/net/inet_hashtables.h | 6 +- include/net/rstreason.h | 2 +- include/net/secure_seq.h | 4 - include/trace/events/sock.h | 1 - include/trace/events/sunrpc.h | 2 - net/Kconfig | 1 - net/Makefile | 1 - net/core/secure_seq.c | 42 - net/core/sock_diag.c | 2 - net/dccp/Kconfig | 46 - net/dccp/Makefile | 30 - net/dccp/ackvec.c | 403 ----- net/dccp/ackvec.h | 136 -- net/dccp/ccid.c | 219 --- net/dccp/ccid.h | 262 ---- net/dccp/ccids/Kconfig | 55 - net/dccp/ccids/ccid2.c | 794 ---------- net/dccp/ccids/ccid2.h | 121 -- net/dccp/ccids/ccid3.c | 866 ----------- net/dccp/ccids/ccid3.h | 148 -- net/dccp/ccids/lib/loss_interval.c | 184 --- net/dccp/ccids/lib/loss_interval.h | 69 - net/dccp/ccids/lib/packet_history.c | 439 ------ net/dccp/ccids/lib/packet_history.h | 142 -- net/dccp/ccids/lib/tfrc.c | 46 - net/dccp/ccids/lib/tfrc.h | 73 - net/dccp/ccids/lib/tfrc_equation.c | 702 --------- net/dccp/dccp.h | 483 ------ net/dccp/diag.c | 85 -- net/dccp/feat.c | 1581 -------------------- net/dccp/feat.h | 133 -- net/dccp/input.c | 739 --------- net/dccp/ipv4.c | 1101 -------------- net/dccp/ipv6.c | 1174 --------------- net/dccp/ipv6.h | 27 - net/dccp/minisocks.c | 266 ---- net/dccp/options.c | 609 -------- net/dccp/output.c | 708 --------- net/dccp/proto.c | 1293 ---------------- net/dccp/qpolicy.c | 136 -- net/dccp/sysctl.c | 107 -- net/dccp/timer.c | 272 ---- net/dccp/trace.h | 82 - net/ipv4/Kconfig | 2 +- net/ipv4/af_inet.c | 5 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_diag.c | 2 - net/ipv6/ip6_output.c | 2 +- samples/bpf/sockex2_kern.c | 1 - scripts/checkpatch.pl | 2 +- security/lsm_audit.c | 19 - security/selinux/hooks.c | 41 +- security/selinux/include/classmap.h | 2 - security/selinux/nlmsgtab.c | 1 - security/smack/smack_lsm.c | 9 +- 81 files changed, 14 insertions(+), 14274 deletions(-) delete mode 100644 Documentation/networking/dccp.rst delete mode 100644 include/linux/tfrc.h delete mode 100644 net/dccp/Kconfig delete mode 100644 net/dccp/Makefile delete mode 100644 net/dccp/ackvec.c delete mode 100644 net/dccp/ackvec.h delete mode 100644 net/dccp/ccid.c delete mode 100644 net/dccp/ccid.h delete mode 100644 net/dccp/ccids/Kconfig delete mode 100644 net/dccp/ccids/ccid2.c delete mode 100644 net/dccp/ccids/ccid2.h delete mode 100644 net/dccp/ccids/ccid3.c delete mode 100644 net/dccp/ccids/ccid3.h delete mode 100644 net/dccp/ccids/lib/loss_interval.c delete mode 100644 net/dccp/ccids/lib/loss_interval.h delete mode 100644 net/dccp/ccids/lib/packet_history.c delete mode 100644 net/dccp/ccids/lib/packet_history.h delete mode 100644 net/dccp/ccids/lib/tfrc.c delete mode 100644 net/dccp/ccids/lib/tfrc.h delete mode 100644 net/dccp/ccids/lib/tfrc_equation.c delete mode 100644 net/dccp/dccp.h delete mode 100644 net/dccp/diag.c delete mode 100644 net/dccp/feat.c delete mode 100644 net/dccp/feat.h delete mode 100644 net/dccp/input.c delete mode 100644 net/dccp/ipv4.c delete mode 100644 net/dccp/ipv6.c delete mode 100644 net/dccp/ipv6.h delete mode 100644 net/dccp/minisocks.c delete mode 100644 net/dccp/options.c delete mode 100644 net/dccp/output.c delete mode 100644 net/dccp/proto.c delete mode 100644 net/dccp/qpolicy.c delete mode 100644 net/dccp/sysctl.c delete mode 100644 net/dccp/timer.c delete mode 100644 net/dccp/trace.h (limited to 'include') diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index ce6f4e8ca487..30858757c9f2 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -196,7 +196,7 @@ will see the assembler code for the routine shown, but if your kernel has debug symbols the C code will also be available. (Debug symbols can be enabled in the kernel hacking menu of the menu configuration.) For example:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/networking/dccp.rst b/Documentation/networking/dccp.rst deleted file mode 100644 index 91e5c33ba3ff..000000000000 --- a/Documentation/networking/dccp.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============= -DCCP protocol -============= - - -.. Contents - - Introduction - - Missing features - - Socket options - - Sysctl variables - - IOCTLs - - Other tunables - - Notes - - -Introduction -============ -Datagram Congestion Control Protocol (DCCP) is an unreliable, connection -oriented protocol designed to solve issues present in UDP and TCP, particularly -for real-time and multimedia (streaming) traffic. -It divides into a base protocol (RFC 4340) and pluggable congestion control -modules called CCIDs. Like pluggable TCP congestion control, at least one CCID -needs to be enabled in order for the protocol to function properly. In the Linux -implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as -the TCP-friendly CCID3 (RFC 4342), are optional. -For a brief introduction to CCIDs and suggestions for choosing a CCID to match -given applications, see section 10 of RFC 4340. - -It has a base protocol and pluggable congestion control IDs (CCIDs). - -DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol -is at http://www.ietf.org/html.charters/dccp-charter.html - - -Missing features -================ -The Linux DCCP implementation does not currently support all the features that are -specified in RFCs 4340...42. - -The known bugs are at: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP - -For more up-to-date versions of the DCCP implementation, please consider using -the experimental DCCP test tree; instructions for checking this out are on: -http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp_testing#Experimental_DCCP_source_tree - - -Socket options -============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows:: - - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. - -DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of -service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, -the socket will fall back to 0 (which means that no meaningful service code -is present). On active sockets this is set before connect(); specifying more -than one code has no effect (all subsequent service codes are ignored). The -case is different for passive sockets, where multiple service codes (up to 32) -can be set before calling bind(). - -DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet -size (application payload size) in bytes, see RFC 4340, section 14. - -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint. The option value is an array of type uint8_t whose -size is passed as option length. The minimum array size is 4 elements, the -value returned in the optlen argument always reflects the true number of -built-in CCIDs. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is ``int``, not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - -DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold -timewait state when closing the connection (RFC 4340, 8.3). The usual case is -that the closing server sends a CloseReq, whereupon the client holds timewait -state. When this boolean socket option is on, the server sends a Close instead -and will enter TIMEWAIT. This option must be set after accept() returns. - -DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the -partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums -always cover the entire packet and that only fully covered application data is -accepted by the receiver. Hence, when using this feature on the sender, it must -be enabled at the receiver, too with suitable choice of CsCov. - -DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the - range 0..15 are acceptable. The default setting is 0 (full coverage), - values between 1..15 indicate partial coverage. - -DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it - sets a threshold, where again values 0..15 are acceptable. The default - of 0 means that all packets with a partial coverage will be discarded. - Values in the range 1..15 indicate that packets with minimally such a - coverage value are also acceptable. The higher the number, the more - restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage - settings are inherited to the child socket after accept(). - -The following two options apply to CCID 3 exclusively and are getsockopt()-only. -In either case, a TFRC info struct (defined in ) is returned. - -DCCP_SOCKOPT_CCID_RX_INFO - Returns a ``struct tfrc_rx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_rx_info). - -DCCP_SOCKOPT_CCID_TX_INFO - Returns a ``struct tfrc_tx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_tx_info). - -On unidirectional connections it is useful to close the unused half-connection -via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs. - - -Sysctl variables -================ -Several DCCP default parameters can be managed by the following sysctls -(sysctl net.dccp.default or /proc/sys/net/dccp/default): - -request_retries - The number of active connection initiation retries (the number of - Requests minus one) before timing out. In addition, it also governs - the behaviour of the other, passive side: this variable also sets - the number of times DCCP repeats sending a Response when the initial - handshake does not progress from RESPOND to OPEN (i.e. when no Ack - is received after the initial Request). This value should be greater - than 0, suggested is less than 10. Analogue of tcp_syn_retries. - -retries1 - How often a DCCP Response is retransmitted until the listening DCCP - side considers its connecting peer dead. Analogue of tcp_retries1. - -retries2 - The number of times a general DCCP packet is retransmitted. This has - importance for retransmitted acknowledgments and feature negotiation, - data packets are never retransmitted. Analogue of tcp_retries2. - -tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. - -rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. - -seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). - Values in the range Wmin = 32 (RFC 4340, 7.5.2) up to 2^32-1 can be set. - -tx_qlen = 5 - The size of the transmit buffer in packets. A value of 0 corresponds - to an unbounded transmit buffer. - -sync_ratelimit = 125 ms - The timeout between subsequent DCCP-Sync packets sent in response to - sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit - of this parameter is milliseconds; a value of 0 disables rate-limiting. - - -IOCTLS -====== -FIONREAD - Works as in udp(7): returns in the ``int`` argument pointer the size of - the next pending datagram in bytes, or 0 when no datagram is pending. - -SIOCOUTQ - Returns the number of unsent data bytes in the socket send queue as ``int`` - into the buffer specified by the argument pointer. - -Other tunables -============== -Per-route rto_min support - CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value - of the RTO timer. This setting can be modified via the 'rto_min' option - of iproute2; for example:: - - > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 - > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 - > ip route show dev wlan0 - - CCID-3 also supports the rto_min setting: it is used to define the lower - bound for the expiry of the nofeedback timer. This can be useful on LANs - with very low RTTs (e.g., loopback, Gbit ethernet). - - -Notes -===== -DCCP does not travel through NAT successfully at present on many boxes. This is -because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT -support for DCCP has been added. diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c64133d309bf..ac90b82f3ce9 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -48,7 +48,6 @@ Contents: ax25 bonding cdc_mbim - dccp dctcp devmem dns_resolver diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 5c63ab928b97..b43222ee57cf 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -37,8 +37,8 @@ ip_no_pmtu_disc - INTEGER Mode 3 is a hardened pmtu discover mode. The kernel will only accept fragmentation-needed errors if the underlying protocol can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the + protocols for which pmtu events will be honored are TCP and + SCTP as they verify e.g. the sequence number or the association. This mode should not be enabled globally but is only intended to secure e.g. name servers in namespaces where TCP path mtu must still work but path MTU information of other diff --git a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst index c3f6a83294dc..4b3432753eb9 100644 --- a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst @@ -188,7 +188,7 @@ objdump 编行。如果没有调试符号,您将看到所示例程的汇编程序代码,但是如果内核有调试 符号,C代码也将可见(调试符号可以在内核配置菜单的hacking项中启用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst index b25ecc44d735..80ea5677ee52 100644 --- a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst @@ -191,7 +191,7 @@ objdump 編行。如果沒有調試符號,您將看到所示例程的彙編程序代碼,但是如果內核有調試 符號,C代碼也將可見(調試符號可以在內核配置菜單的hacking項中啓用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/MAINTAINERS b/MAINTAINERS index c59316109e3f..1248443035f4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6546,15 +6546,6 @@ S: Maintained F: Documentation/scsi/dc395x.rst F: drivers/scsi/dc395x.* -DCCP PROTOCOL -L: dccp@vger.kernel.org -S: Orphan -W: http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp -F: include/linux/dccp.h -F: include/linux/tfrc.h -F: include/uapi/linux/dccp.h -F: net/dccp/ - DEBUGOBJECTS: M: Thomas Gleixner L: linux-kernel@vger.kernel.org diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 31ecb8b7b9f1..85bd696b0e90 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -267,8 +267,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 1f57514624d5..93003ceb51bb 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -263,8 +263,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 02db7a48e57e..fa6ed1bb01e3 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -270,8 +270,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index f0e673cb17eb..80ca001d5eab 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e8ca5a50b86d..5e35e4b1fe16 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -262,8 +262,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index b3a270441bb1..11bc3f3c7576 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index d215dba006ce..343f702210df 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -281,8 +281,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index a888ed93ff82..6104a5a24594 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -259,8 +259,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index b481782375f6..a99dd6c7c9c8 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 6eba743d8eb5..0d87c119d4c4 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 9bdbb418ffa8..841f9615b6ab 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -256,8 +256,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index e1cf20fa5343..12c5dc92ea4c 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -257,8 +257,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig index 8f7c36868204..97d2cd997285 100644 --- a/arch/mips/configs/bigsur_defconfig +++ b/arch/mips/configs/bigsur_defconfig @@ -81,7 +81,6 @@ CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m CONFIG_IP_VS_FTP=m -CONFIG_IP_DCCP=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 12f3eed8a946..df55fe6df8ba 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -84,7 +84,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 06b7a0b97eca..487bd84043f7 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -130,7 +130,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 1bc3466bc909..ae45f70b29f0 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -87,7 +87,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_IP_DCCP=m CONFIG_BT=m CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index a91a766b71a4..5e0f2cd7e62b 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -225,7 +225,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m CONFIG_TIPC=m CONFIG_ATM=m CONFIG_ATM_CLIP=m diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 325af611909f..0b61b8b996d4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -2,79 +2,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - /* - * States involved in closing a DCCP connection: - * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. - * - * 2) CLOSING can have three different meanings (RFC 4340, 8.3): - * a. Client has performed active-close, has sent a Close to the server - * from state OPEN or PARTOPEN, and is waiting for the final Reset - * (in this case, SOCK_DONE == 1). - * b. Client is asked to perform passive-close, by receiving a CloseReq - * in (PART)OPEN state. It sends a Close and waits for final Reset - * (in this case, SOCK_DONE == 0). - * c. Server performs an active-close as in (a), keeps TIMEWAIT state. - * - * 3) The following intermediate states are employed to give passively - * closing nodes a chance to process their unread data: - * - PASSIVE_CLOSE (from OPEN => CLOSED) and - * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). - */ - DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, - DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, - DCCP_PARTOPEN = TCP_MAX_STATES, - DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ - DCCP_MAX_STATES -}; - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, - DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), -}; - -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb_transport_header(skb); -} - -static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) -{ - skb_push(skb, headlen); - skb_reset_transport_header(skb); - return memset(skb_transport_header(skb), 0, headlen); -} - static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); @@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) return sizeof(*dh) + (dh->dccph_x ? sizeof(struct dccp_hdr_ext) : 0); } -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return __dccp_basic_hdr_len(dh); -} - static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); @@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) return seq_nr; } -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); -} - -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return __dccp_hdr_len(dccp_hdr(skb)); -} - -/** - * struct dccp_request_sock - represent DCCP-specific connection request - * @dreq_inet_rsk: structure inherited from - * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) - * @dreq_gss: greatest sequence number sent (for retransmitted Responses) - * @dreq_isr: initial sequence number received in the first Request - * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) - * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection - * The following two fields are analogous to the ones in dccp_sock: - * @dreq_timestamp_echo: last received timestamp to echo (13.1) - * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo - */ -struct dccp_request_sock { - struct inet_request_sock dreq_inet_rsk; - __u64 dreq_iss; - __u64 dreq_gss; - __u64 dreq_isr; - __u64 dreq_gsr; - __be32 dreq_service; - spinlock_t dreq_lock; - struct list_head dreq_featneg; - __u32 dreq_timestamp_echo; - __u32 dreq_timestamp_time; -}; - -static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) -{ - return (struct dccp_request_sock *)req; -} - -extern struct inet_timewait_death_row dccp_death_row; - -extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb); - -struct dccp_options_received { - u64 dccpor_ndp:48; - u32 dccpor_timestamp; - u32 dccpor_timestamp_echo; - u32 dccpor_elapsed_time; -}; - -struct ccid; - -enum dccp_role { - DCCP_ROLE_UNDEFINED, - DCCP_ROLE_LISTEN, - DCCP_ROLE_CLIENT, - DCCP_ROLE_SERVER, -}; - -struct dccp_service_list { - __u32 dccpsl_nr; - __be32 dccpsl_list[]; -}; - -#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) -#define DCCP_SERVICE_CODE_IS_ABSENT 0 - -static inline bool dccp_list_has_service(const struct dccp_service_list *sl, - const __be32 service) -{ - if (likely(sl != NULL)) { - u32 i = sl->dccpsl_nr; - while (i--) - if (sl->dccpsl_list[i] == service) - return true; - } - return false; -} - -struct dccp_ackvec; - -/** - * struct dccp_sock - DCCP socket state - * - * @dccps_swl - sequence number window low - * @dccps_swh - sequence number window high - * @dccps_awl - acknowledgement number window low - * @dccps_awh - acknowledgement number window high - * @dccps_iss - initial sequence number sent - * @dccps_isr - initial sequence number received - * @dccps_osr - first OPEN sequence number received - * @dccps_gss - greatest sequence number sent - * @dccps_gsr - greatest valid sequence number received - * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss - * @dccps_service - first (passive sock) or unique (active sock) service code - * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo - * @dccps_l_ack_ratio - feature-local Ack Ratio - * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) - * @dccps_pcslen - sender partial checksum coverage (via sockopt) - * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) - * @dccps_ndp_count - number of Non Data Packets since last data packet - * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) - * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) - * @dccps_hc_rx_ackvec - rx half connection ack vector - * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) - * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) - * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue - * @dccps_role - role of this sock, one of %dccp_role - * @dccps_hc_rx_insert_options - receiver wants to add options when acking - * @dccps_hc_tx_insert_options - sender wants to add options when sending - * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) - * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) - */ -struct dccp_sock { - /* inet_connection_sock has to be the first member of dccp_sock */ - struct inet_connection_sock dccps_inet_connection; -#define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime - __u64 dccps_swl; - __u64 dccps_swh; - __u64 dccps_awl; - __u64 dccps_awh; - __u64 dccps_iss; - __u64 dccps_isr; - __u64 dccps_osr; - __u64 dccps_gss; - __u64 dccps_gsr; - __u64 dccps_gar; - __be32 dccps_service; - __u32 dccps_mss_cache; - struct dccp_service_list *dccps_service_list; - __u32 dccps_timestamp_echo; - __u32 dccps_timestamp_time; - __u16 dccps_l_ack_ratio; - __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; - __u64 dccps_ndp_count:48; - unsigned long dccps_rate_last; - struct list_head dccps_featneg; - struct dccp_ackvec *dccps_hc_rx_ackvec; - struct ccid *dccps_hc_rx_ccid; - struct ccid *dccps_hc_tx_ccid; - struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; - enum dccp_role dccps_role:2; - __u8 dccps_hc_rx_insert_options:1; - __u8 dccps_hc_tx_insert_options:1; - __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; - struct timer_list dccps_xmit_timer; -}; - -#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ - dccps_inet_connection.icsk_inet.sk) - -static inline const char *dccp_role(const struct sock *sk) -{ - switch (dccp_sk(sk)->dccps_role) { - case DCCP_ROLE_UNDEFINED: return "undefined"; - case DCCP_ROLE_LISTEN: return "listen"; - case DCCP_ROLE_SERVER: return "server"; - case DCCP_ROLE_CLIENT: return "client"; - } - return NULL; -} - -extern void dccp_syn_ack_timeout(const struct request_sock *req); - #endif /* _LINUX_DCCP_H */ diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h deleted file mode 100644 index a5acc768085d..000000000000 --- a/include/linux/tfrc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _LINUX_TFRC_H_ -#define _LINUX_TFRC_H_ -/* - * TFRC - Data Structures for the TCP-Friendly Rate Control congestion - * control mechanism as specified in RFC 3448. - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include - -/** tfrc_rx_info - TFRC Receiver Data Structure - * - * @tfrcrx_x_recv: receiver estimate of sending rate (3.2.2) - * @tfrcrx_rtt: round-trip-time (communicated by sender) - * @tfrcrx_p: current estimate of loss event rate (3.2.2) - */ -struct tfrc_rx_info { - __u32 tfrcrx_x_recv; - __u32 tfrcrx_rtt; - __u32 tfrcrx_p; -}; - -/** tfrc_tx_info - TFRC Sender Data Structure - * - * @tfrctx_x: computed transmit rate (4.3 (4)) - * @tfrctx_x_recv: receiver estimate of send rate (4.3) - * @tfrctx_x_calc: return value of throughput equation (3.1) - * @tfrctx_rtt: (moving average) estimate of RTT (4.3) - * @tfrctx_p: current loss event rate (5.4) - * @tfrctx_rto: estimate of RTO, equals 4*RTT (4.3) - * @tfrctx_ipi: inter-packet interval (4.6) - * - * Note: X and X_recv are both maintained in units of 64 * bytes/second. This - * enables a finer resolution of sending rates and avoids problems with - * integer arithmetic; u32 is not sufficient as scaling consumes 6 bits. - */ -struct tfrc_tx_info { - __u64 tfrctx_x; - __u64 tfrctx_x_recv; - __u32 tfrctx_x_calc; - __u32 tfrctx_rtt; - __u32 tfrctx_p; - __u32 tfrctx_rto; - __u32 tfrctx_ipi; -}; - -#endif /* _LINUX_TFRC_H_ */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 949641e92539..d172b64a6320 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -177,12 +177,8 @@ struct inet_hashinfo { static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else + /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7da..979ac87b5d99 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a1813..cddebafb9f77 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3836de435d9d..b5310439536e 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,6 @@ /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ - EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b..de214f1dea58 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -21,7 +21,6 @@ TRACE_DEFINE_ENUM(SOCK_DGRAM); TRACE_DEFINE_ENUM(SOCK_RAW); TRACE_DEFINE_ENUM(SOCK_RDM); TRACE_DEFINE_ENUM(SOCK_SEQPACKET); -TRACE_DEFINE_ENUM(SOCK_DCCP); TRACE_DEFINE_ENUM(SOCK_PACKET); #define show_socket_type(type) \ @@ -31,7 +30,6 @@ TRACE_DEFINE_ENUM(SOCK_PACKET); { SOCK_RAW, "RAW" }, \ { SOCK_RDM, "RDM" }, \ { SOCK_SEQPACKET, "SEQPACKET" }, \ - { SOCK_DCCP, "DCCP" }, \ { SOCK_PACKET, "PACKET" }) /* This list is known to be incomplete, add new enums as needed. */ diff --git a/net/Kconfig b/net/Kconfig index c3fca69a7c83..202cc595e5e6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,7 +245,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" source "net/tipc/Kconfig" diff --git a/net/Makefile b/net/Makefile index 60ed5190eda8..aac960c41db6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ endif -obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_RDS) += rds/ obj-$(CONFIG_WIRELESS) += wireless/ diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 568779d5a0ef..9a3965680451 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a08eed9b9142..b23594c767f2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig deleted file mode 100644 index 0c7d2f66ba27..000000000000 --- a/net/dccp/Kconfig +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig IP_DCCP - tristate "The DCCP Protocol" - depends on INET - help - Datagram Congestion Control Protocol (RFC 4340) - - From https://www.ietf.org/rfc/rfc4340.txt: - - The Datagram Congestion Control Protocol (DCCP) is a transport - protocol that implements bidirectional, unicast connections of - congestion-controlled, unreliable datagrams. It should be suitable - for use by applications such as streaming media, Internet telephony, - and on-line games. - - To compile this protocol support as a module, choose M here: the - module will be called dccp. - - If in doubt, say N. - -if IP_DCCP - -config INET_DCCP_DIAG - depends on INET_DIAG - def_tristate y if (IP_DCCP = y && INET_DIAG = y) - def_tristate m - -source "net/dccp/ccids/Kconfig" - -menu "DCCP Kernel Hacking" - depends on DEBUG_KERNEL=y - -config IP_DCCP_DEBUG - bool "DCCP debug messages" - help - Only use this if you're hacking DCCP. - - When compiling DCCP as a module, this debugging output can be toggled - by setting the parameter dccp_debug of the `dccp' module to 0 or 1. - - Just say N. - - -endmenu - -endif # IP_DDCP diff --git a/net/dccp/Makefile b/net/dccp/Makefile deleted file mode 100644 index 5b4ff37bc806..000000000000 --- a/net/dccp/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o - -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ - qpolicy.o -# -# CCID algorithms to be used by dccp.ko -# -# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency -dccp-y += ccids/ccid2.o ackvec.o -dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o -dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ - ccids/lib/tfrc_equation.o \ - ccids/lib/packet_history.o \ - ccids/lib/loss_interval.o - -dccp_ipv4-y := ipv4.o - -# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module -obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o -dccp_ipv6-y := ipv6.o - -obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o - -dccp-$(CONFIG_SYSCTL) += sysctl.o - -dccp_diag-y := diag.o - -# build with local directory for trace.h -CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c deleted file mode 100644 index 1cba001bb4c8..000000000000 --- a/net/dccp/ackvec.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ackvec.c - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "dccp.h" -#include -#include -#include - -static struct kmem_cache *dccp_ackvec_slab; -static struct kmem_cache *dccp_ackvec_record_slab; - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; -} - -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) -{ - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); - } -} - -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) -{ - struct dccp_ackvec_record *avr; - - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (avr == NULL) - return -ENOBUFS; - - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); - /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. - */ - list_add(&avr->avr_node, &av->av_records); - - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); - return 0; -} - -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; - } - return NULL; -} - -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) -{ - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; -} - -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) -{ - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); -} - -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) -{ - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); -} - -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) -{ - u16 ptr = av->av_buf_head; - - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; - - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; - } - - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); - - } while (ptr != av->av_buf_tail); -} - -/* Mark @num entries after buf_head as "Not yet received". */ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); -} - -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) -{ - u32 num_cells = num_packets; - - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; - - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); - /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. - */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; - - lost_packets -= len; - } - } - - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; - - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); -} - -/** - * dccp_ackvec_input - Register incoming packet in the buffer - * @av: Ack Vector to register packet to - * @skb: Packet to register - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) -{ - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; - - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; - - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { - - *current_head += 1; - av->av_buf_ackno = seqno; - - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } - } -} - -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * @av: Ack Vector record to clean - * @ackno: last Ack Vector which has been acknowledged - * - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. While based on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) -{ - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; - - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) - return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); - /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. - */ - if (runlen_now > eff_runlen) { - - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); - - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; - /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. - */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. - */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); - -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} - -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) -{ - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; - - list_add_tail(&new->node, head); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); - -int __init dccp_ackvec_init(void) -{ - dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_slab == NULL) - goto out_err; - - dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_record_slab == NULL) - goto out_destroy_slab; - - return 0; - -out_destroy_slab: - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; -out_err: - DCCP_CRIT("Unable to create Ack Vector slab cache"); - return -ENOBUFS; -} - -void dccp_ackvec_exit(void) -{ - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; -} diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h deleted file mode 100644 index d2c4220fb377..000000000000 --- a/net/dccp/ackvec.h +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ACKVEC_H -#define _ACKVEC_H -/* - * net/dccp/ackvec.h - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include - -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F - -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} - -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} - -/** - * struct dccp_ackvec - Ack Vector main data structure - * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. - * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - */ -struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; - struct list_head av_records; -}; - -/** - * struct dccp_ackvec_record - Records information about sent Ack Vectors - * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. - * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent - * - * The list as a whole is sorted in descending order by @avr_ack_seqno. - */ -struct dccp_ackvec_record { - struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; - u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; -}; - -int dccp_ackvec_init(void); -void dccp_ackvec_exit(void); - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); -void dccp_ackvec_free(struct dccp_ackvec *av); - -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); - -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) -{ - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; -} - -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * - * This structure is used by CCIDs to access Ack Vectors in a received skb. - */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; - -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce); -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); -#endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c deleted file mode 100644 index 6beac5d348e2..000000000000 --- a/net/dccp/ccid.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ccid.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include - -#include "ccid.h" -#include "ccids/lib/tfrc.h" - -static struct ccid_operations *ccids[] = { - &ccid2_ops, -#ifdef CONFIG_IP_DCCP_CCID3 - &ccid3_ops, -#endif -}; - -static struct ccid_operations *ccid_by_number(const u8 id) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - if (ccids[i]->ccid_id == id) - return ccids[i]; - return NULL; -} - -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - while (array_len > 0) - if (ccid_by_number(ccid_array[--array_len]) == NULL) - return false; - return true; -} - -/** - * ccid_get_builtin_ccids - Populate a list of built-in CCIDs - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - - for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) - (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - u8 *ccid_array, array_len; - int err = 0; - - if (ccid_get_builtin_ccids(&ccid_array, &array_len)) - return -ENOBUFS; - - if (put_user(array_len, optlen)) - err = -EFAULT; - else if (len > 0 && copy_to_user(optval, ccid_array, - len > array_len ? array_len : len)) - err = -EFAULT; - - kfree(ccid_array); - return err; -} - -static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) -{ - struct kmem_cache *slab; - va_list args; - - va_start(args, fmt); - vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args); - va_end(args); - - slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - return slab; -} - -static void ccid_kmem_cache_destroy(struct kmem_cache *slab) -{ - kmem_cache_destroy(slab); -} - -static int __init ccid_activate(struct ccid_operations *ccid_ops) -{ - int err = -ENOBUFS; - - ccid_ops->ccid_hc_rx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, - ccid_ops->ccid_hc_rx_slab_name, - "ccid%u_hc_rx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_rx_slab == NULL) - goto out; - - ccid_ops->ccid_hc_tx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, - ccid_ops->ccid_hc_tx_slab_name, - "ccid%u_hc_tx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_tx_slab == NULL) - goto out_free_rx_slab; - - pr_info("DCCP: Activated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); - err = 0; -out: - return err; -out_free_rx_slab: - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - goto out; -} - -static void ccid_deactivate(struct ccid_operations *ccid_ops) -{ - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); - ccid_ops->ccid_hc_tx_slab = NULL; - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - - pr_info("DCCP: Deactivated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); -} - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) -{ - struct ccid_operations *ccid_ops = ccid_by_number(id); - struct ccid *ccid = NULL; - - if (ccid_ops == NULL) - goto out; - - ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, gfp_any()); - if (ccid == NULL) - goto out; - ccid->ccid_ops = ccid_ops; - if (rx) { - memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); - if (ccid->ccid_ops->ccid_hc_rx_init != NULL && - ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0) - goto out_free_ccid; - } else { - memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size); - if (ccid->ccid_ops->ccid_hc_tx_init != NULL && - ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0) - goto out_free_ccid; - } -out: - return ccid; -out_free_ccid: - kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, ccid); - ccid = NULL; - goto out; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) - ccid->ccid_ops->ccid_hc_rx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); - } -} - -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) - ccid->ccid_ops->ccid_hc_tx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); - } -} - -int __init ccid_initialize_builtins(void) -{ - int i, err = tfrc_lib_init(); - - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) { - err = ccid_activate(ccids[i]); - if (err) - goto unwind_registrations; - } - return 0; - -unwind_registrations: - while(--i >= 0) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); - return err; -} - -void ccid_cleanup_builtins(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); -} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h deleted file mode 100644 index 105f3734dadb..000000000000 --- a/net/dccp/ccid.h +++ /dev/null @@ -1,262 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _CCID_H -#define _CCID_H -/* - * net/dccp/ccid.h - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include -#include -#include -#include -#include - -/* maximum value for a CCID (RFC 4340, 19.5) */ -#define CCID_MAX 255 -#define CCID_SLAB_NAME_LENGTH 32 - -struct tcp_info; - -/** - * struct ccid_operations - Interface to Congestion-Control Infrastructure - * - * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) - * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) - * @ccid_name: alphabetical identifier string for @ccid_id - * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection - * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket - * - * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) - * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) - * @ccid_hc_rx_packet_recv: implements the HC-receiver side - * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options - * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options - * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender - * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender - * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender - * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender - * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender - */ -struct ccid_operations { - unsigned char ccid_id; - __u32 ccid_ccmps; - const char *ccid_name; - struct kmem_cache *ccid_hc_rx_slab, - *ccid_hc_tx_slab; - char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; - char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; - __u32 ccid_hc_rx_obj_size, - ccid_hc_tx_obj_size; - /* Interface Routines */ - int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); - int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); - void (*ccid_hc_rx_exit)(struct sock *sk); - void (*ccid_hc_tx_exit)(struct sock *sk); - void (*ccid_hc_rx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_rx_insert_options)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_tx_send_packet)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); - void (*ccid_hc_rx_get_info)(struct sock *sk, - struct tcp_info *info); - void (*ccid_hc_tx_get_info)(struct sock *sk, - struct tcp_info *info); - int (*ccid_hc_rx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); - int (*ccid_hc_tx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); -}; - -extern struct ccid_operations ccid2_ops; -#ifdef CONFIG_IP_DCCP_CCID3 -extern struct ccid_operations ccid3_ops; -#endif - -int ccid_initialize_builtins(void); -void ccid_cleanup_builtins(void); - -struct ccid { - struct ccid_operations *ccid_ops; - char ccid_priv[]; -}; - -static inline void *ccid_priv(const struct ccid *ccid) -{ - return (void *)ccid->ccid_priv; -} - -bool ccid_support_check(u8 const *ccid_array, u8 array_len); -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); - -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); - -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - -static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; -} - -static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); -} - -static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); -} - -static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); -} - -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ -static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); -} - -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ -static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); -} - -static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) - return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); - return 0; -} - -static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) - ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); -} - -static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) - ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); -} - -static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} - -static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} -#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig deleted file mode 100644 index e3d388c33d25..000000000000 --- a/net/dccp/ccids/Kconfig +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menu "DCCP CCIDs Configuration" - -config IP_DCCP_CCID2_DEBUG - bool "CCID-2 debugging messages" - help - Enable CCID-2 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid2_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_CCID3 - bool "CCID-3 (TCP-Friendly)" - default IP_DCCP = y || IP_DCCP = m - help - CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based - rate-controlled congestion control mechanism. TFRC is designed to - be reasonably fair when competing for bandwidth with TCP-like flows, - where a flow is "reasonably fair" if its sending rate is generally - within a factor of two of the sending rate of a TCP flow under the - same conditions. However, TFRC has a much lower variation of - throughput over time compared with TCP, which makes CCID-3 more - suitable than CCID-2 for applications such streaming media where a - relatively smooth sending rate is of importance. - - CCID-3 is further described in RFC 4342, - https://www.ietf.org/rfc/rfc4342.txt - - The TFRC congestion control algorithms were initially described in - RFC 5348. - - This text was extracted from RFC 4340 (sec. 10.2), - https://www.ietf.org/rfc/rfc4340.txt - - If in doubt, say N. - -config IP_DCCP_CCID3_DEBUG - bool "CCID-3 debugging messages" - depends on IP_DCCP_CCID3 - help - Enable CCID-3 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid3_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_TFRC_LIB - def_bool y if IP_DCCP_CCID3 - -config IP_DCCP_TFRC_DEBUG - def_bool y if IP_DCCP_CCID3_DEBUG -endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c deleted file mode 100644 index d6b30700af67..000000000000 --- a/net/dccp/ccids/ccid2.c +++ /dev/null @@ -1,794 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005, 2006 Andrea Bittau - * - * Changes to meet Linux coding standards, and DCCP infrastructure fixes. - * - * Copyright (c) 2006 Arnaldo Carvalho de Melo - */ - -/* - * This implementation should follow RFC 4341 - */ -#include -#include "../feat.h" -#include "ccid2.h" - - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -static bool ccid2_debug; -#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) -#else -#define ccid2_pr_debug(format, a...) -#endif - -static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) -{ - struct ccid2_seq *seqp; - int i; - - /* check if we have space to preserve the pointer to the buffer */ - if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / - sizeof(struct ccid2_seq *))) - return -ENOMEM; - - /* allocate buffer and initialize linked list */ - seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), - gfp_any()); - if (seqp == NULL) - return -ENOMEM; - - for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { - seqp[i].ccid2s_next = &seqp[i + 1]; - seqp[i + 1].ccid2s_prev = &seqp[i]; - } - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; - seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - - /* This is the first allocation. Initiate the head and tail. */ - if (hc->tx_seqbufc == 0) - hc->tx_seqh = hc->tx_seqt = seqp; - else { - /* link the existing list with the one we just created */ - hc->tx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hc->tx_seqh; - - hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; - } - - /* store the original pointer to the buffer so we can free it */ - hc->tx_seqbuf[hc->tx_seqbufc] = seqp; - hc->tx_seqbufc++; - - return 0; -} - -static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) -{ - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); - - /* - * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from - * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always - * acceptable since this causes starvation/deadlock whenever cwnd < 2. - * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). - */ - if (val == 0 || val > max_ratio) { - DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); - val = max_ratio; - } - dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, - min_t(u32, val, DCCPF_ACK_RATIO_MAX)); -} - -static void ccid2_check_l_ack_ratio(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - /* - * After a loss, idle period, application limited period, or RTO we - * need to check that the ack ratio is still less than the congestion - * window. Otherwise, we will send an entire congestion window of - * packets and got no response because we haven't sent ack ratio - * packets yet. - * If the ack ratio does need to be reduced, we reduce it to half of - * the congestion window (or 1 if that's zero) instead of to the - * congestion window. This prevents problems if one ack is lost. - */ - if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) - ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U); -} - -static void ccid2_change_l_seq_window(struct sock *sk, u64 val) -{ - dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, - clamp_val(val, DCCPF_SEQ_WMIN, - DCCPF_SEQ_WMAX)); -} - -static void dccp_tasklet_schedule(struct sock *sk) -{ - struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; - - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - sock_hold(sk); - __tasklet_schedule(t); - } -} - -static void ccid2_hc_tx_rto_expire(struct timer_list *t) -{ - struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); - struct sock *sk = hc->sk; - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); - goto out; - } - - ccid2_pr_debug("RTO_EXPIRE\n"); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - /* back-off timer */ - hc->tx_rto <<= 1; - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; - - /* adjust pipe, cwnd etc */ - hc->tx_ssthresh = hc->tx_cwnd / 2; - if (hc->tx_ssthresh < 2) - hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; - - /* clear state about stuff we sent */ - hc->tx_seqt = hc->tx_seqh; - hc->tx_packets_acked = 0; - - /* clear ack ratio state. */ - hc->tx_rpseq = 0; - hc->tx_rpdupack = -1; - ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - dccp_tasklet_schedule(sk); - /* restart backed-off timer */ - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/* - * Congestion window validation (RFC 2861). - */ -static bool ccid2_do_cwv = true; -module_param(ccid2_do_cwv, bool, 0644); -MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); - -/** - * ccid2_update_used_window - Track how much of cwnd is actually used - * @hc: socket to update window - * @new_wnd: new window values to add into the filter - * - * This is done in addition to CWV. The sender needs to have an idea of how many - * packets may be in flight, to set the local Sequence Window value accordingly - * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the - * maximum-used window. We use an EWMA low-pass filter to filter out noise. - */ -static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) -{ - hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; -} - -/* This borrows the code of tcp_cwnd_application_limited() */ -static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* don't reduce cwnd below the initial window (IW) */ - u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), - win_used = max(hc->tx_cwnd_used, init_win); - - if (win_used < hc->tx_cwnd) { - hc->tx_ssthresh = max(hc->tx_ssthresh, - (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); - hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; - } - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - - ccid2_check_l_ack_ratio(sk); -} - -/* This borrows the code of tcp_cwnd_restart() */ -static void ccid2_cwnd_restart(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - u32 cwnd = hc->tx_cwnd, restart_cwnd, - iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); - s32 delta = now - hc->tx_lsndtime; - - hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); - - /* don't reduce cwnd below the initial window (IW) */ - restart_cwnd = min(cwnd, iwnd); - - while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) - cwnd >>= 1; - hc->tx_cwnd = max(cwnd, restart_cwnd); - hc->tx_cwnd_stamp = now; - hc->tx_cwnd_used = 0; - - ccid2_check_l_ack_ratio(sk); -} - -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_jiffies32; - struct ccid2_seq *next; - - /* slow-start after idle periods (RFC 2581, RFC 2861) */ - if (ccid2_do_cwv && !hc->tx_pipe && - (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) - ccid2_cwnd_restart(sk, now); - - hc->tx_lsndtime = now; - hc->tx_pipe += 1; - - /* see whether cwnd was fully used (RFC 2861), update expected window */ - if (ccid2_cwnd_network_limited(hc)) { - ccid2_update_used_window(hc, hc->tx_cwnd); - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - } else { - if (hc->tx_pipe > hc->tx_cwnd_used) - hc->tx_cwnd_used = hc->tx_pipe; - - ccid2_update_used_window(hc, hc->tx_cwnd_used); - - if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) - ccid2_cwnd_application_limited(sk, now); - } - - hc->tx_seqh->ccid2s_seq = dp->dccps_gss; - hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = now; - - next = hc->tx_seqh->ccid2s_next; - /* check if we need to alloc more space */ - if (next == hc->tx_seqt) { - if (ccid2_hc_tx_alloc_seq(hc)) { - DCCP_CRIT("packet history - out of memory!"); - /* FIXME: find a more graceful way to bail out */ - return; - } - next = hc->tx_seqh->ccid2s_next; - BUG_ON(next == hc->tx_seqt); - } - hc->tx_seqh = next; - - ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); - - /* - * FIXME: The code below is broken and the variables have been removed - * from the socket struct. The `ackloss' variable was always set to 0, - * and with arsent there are several problems: - * (i) it doesn't just count the number of Acks, but all sent packets; - * (ii) it is expressed in # of packets, not # of windows, so the - * comparison below uses the wrong formula: Appendix A of RFC 4341 - * comes up with the number K = cwnd / (R^2 - R) of consecutive windows - * of data with no lost or marked Ack packets. If arsent were the # of - * consecutive Acks received without loss, then Ack Ratio needs to be - * decreased by 1 when - * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) - * where cwnd / R is the number of Acks received per window of data - * (cf. RFC 4341, App. A). The problems are that - * - arsent counts other packets as well; - * - the comparison uses a formula different from RFC 4341; - * - computing a cubic/quadratic equation each time is too complicated. - * Hence a different algorithm is needed. - */ -#if 0 - /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hc->tx_arsent++; - /* We had an ack loss in this window... */ - if (hc->tx_ackloss) { - if (hc->tx_arsent >= hc->tx_cwnd) { - hc->tx_arsent = 0; - hc->tx_ackloss = 0; - } - } else { - /* No acks lost up to now... */ - /* decrease ack ratio if enough packets were sent */ - if (dp->dccps_l_ack_ratio > 1) { - /* XXX don't calculate denominator each time */ - int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - - dp->dccps_l_ack_ratio; - - denom = hc->tx_cwnd * hc->tx_cwnd / denom; - - if (hc->tx_arsent >= denom) { - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hc->tx_arsent = 0; - } - } else { - /* we can't increase ack ratio further [1] */ - hc->tx_arsent = 0; /* or maybe set it to cwnd*/ - } - } -#endif - - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG - do { - struct ccid2_seq *seqp = hc->tx_seqt; - - while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", - (unsigned long long)seqp->ccid2s_seq, - seqp->ccid2s_acked, seqp->ccid2s_sent); - seqp = seqp->ccid2s_next; - } - } while (0); - ccid2_pr_debug("=========\n"); -#endif -} - -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * @sk: socket to perform estimator on - * @mrtt: measured RTT - * - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. - */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hc->tx_srtt == 0) { - /* First measurement m */ - hc->tx_srtt = m << 3; - hc->tx_mdev = m << 1; - - hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); - hc->tx_rttvar = hc->tx_mdev_max; - - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hc->tx_srtt >> 3); - hc->tx_srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hc->tx_mdev >> 2); - /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). - */ - if (m > 0) - m >>= 3; - } else { - m -= (hc->tx_mdev >> 2); - } - hc->tx_mdev += m; - - if (hc->tx_mdev > hc->tx_mdev_max) { - hc->tx_mdev_max = hc->tx_mdev; - if (hc->tx_mdev_max > hc->tx_rttvar) - hc->tx_rttvar = hc->tx_mdev_max; - } - - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe. - * AWL is probably too low here, as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { - if (hc->tx_mdev_max < hc->tx_rttvar) - hc->tx_rttvar -= (hc->tx_rttvar - - hc->tx_mdev_max) >> 2; - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = tcp_rto_min(sk); - } - } - - /* - * Set RTO from SRTT and RTTVAR - * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. - * This agrees with RFC 4341, 5: - * "Because DCCP does not retransmit data, DCCP does not require - * TCP's recommended minimum timeout of one second". - */ - hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; -} - -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; - - if (hc->tx_cwnd < dp->dccps_l_seq_win && - r_seq_used < dp->dccps_r_seq_win) { - if (hc->tx_cwnd < hc->tx_ssthresh) { - if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { - hc->tx_cwnd += 1; - *maxincr -= 1; - hc->tx_packets_acked = 0; - } - } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { - hc->tx_cwnd += 1; - hc->tx_packets_acked = 0; - } - } - - /* - * Adjust the local sequence window and the ack ratio to allow about - * 5 times the number of packets in the network (RFC 4340 7.5.2) - */ - if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); - else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - - if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); - else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps: requires changes in other places. - */ - ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); -} - -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; - } - - hc->tx_last_cong = ccid2_jiffies32; - - hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; - hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - - ccid2_check_l_ack_ratio(sk); -} - -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); - } - return 0; -} - -static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - struct dccp_ackvec_parsed *avp; - u64 ackno, seqno; - struct ccid2_seq *seqp; - int done = 0; - unsigned int maxincr = 0; - - /* check reverse path congestion */ - seqno = DCCP_SKB_CB(skb)->dccpd_seq; - - /* XXX this whole "algorithm" is broken. Need to fix it to keep track - * of the seqnos of the dupacks so that rpseq and rpdupack are correct - * -sorbo. - */ - /* need to bootstrap */ - if (hc->tx_rpdupack == -1) { - hc->tx_rpdupack = 0; - hc->tx_rpseq = seqno; - } else { - /* check if packet is consecutive */ - if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) - hc->tx_rpseq = seqno; - /* it's a later packet */ - else if (after48(seqno, hc->tx_rpseq)) { - hc->tx_rpdupack++; - - /* check if we got enough dupacks */ - if (hc->tx_rpdupack >= NUMDUPACK) { - hc->tx_rpdupack = -1; /* XXX lame */ - hc->tx_rpseq = 0; -#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ - /* - * FIXME: Ack Congestion Control is broken; in - * the current state instabilities occurred with - * Ack Ratios greater than 1; causing hang-ups - * and long RTO timeouts. This needs to be fixed - * before opening up dynamic changes. -- gerrit - */ - ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); -#endif - } - } - } - - /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) - return; - - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) - goto done; - - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hc->tx_high_ack)) - hc->tx_high_ack = ackno; - - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, ackno)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - - /* - * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 - * packets per acknowledgement. Rounding up avoids that cwnd is not - * advanced when Ack Ratio is 1 and gives a slight edge otherwise. - */ - if (hc->tx_cwnd < hc->tx_ssthresh) - maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); - - /* go through all ack vectors */ - list_for_each_entry(avp, &hc->tx_av_chunks, node) { - /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); - - ccid2_pr_debug("ackvec %llu |%u,%u|\n", - (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); - /* if the seqno we are analyzing is larger than the - * current ackno, then move towards the tail of our - * seqnos. - */ - while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - /* check all seqnos in the range of the vector - * run length - */ - while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); - - /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && - !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) - ccid2_congestion_event(sk, - seqp); - else - ccid2_new_ack(sk, seqp, - &maxincr); - - seqp->ccid2s_acked = 1; - ccid2_pr_debug("Got ack for %llu\n", - (unsigned long long)seqp->ccid2s_seq); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - ackno = SUB48(ackno_end_rl, 1); - } - if (done) - break; - } - - /* The state about what is acked should be correct now - * Check for NUMDUPACK - */ - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - done = 0; - while (1) { - if (seqp->ccid2s_acked) { - done++; - if (done == NUMDUPACK) - break; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - /* If there are at least 3 acknowledgements, anything unacknowledged - * below the last sequence number is considered lost - */ - if (done == NUMDUPACK) { - struct ccid2_seq *last_acked = seqp; - - /* check for lost packets */ - while (1) { - if (!seqp->ccid2s_acked) { - ccid2_pr_debug("Packet lost: %llu\n", - (unsigned long long)seqp->ccid2s_seq); - /* XXX need to traverse from tail -> head in - * order to detect multiple congestion events in - * one ack vector. - */ - ccid2_congestion_event(sk, seqp); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - hc->tx_seqt = last_acked; - } - - /* trim acked packets in tail */ - while (hc->tx_seqt != hc->tx_seqh) { - if (!hc->tx_seqt->ccid2s_acked) - break; - - hc->tx_seqt = hc->tx_seqt->ccid2s_next; - } - - /* restart RTO timer if not all outstanding data has been acked */ - if (hc->tx_pipe == 0) - sk_stop_timer(sk, &hc->tx_rtotimer); - else - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - dccp_tasklet_schedule(sk); - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio; - - /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hc->tx_ssthresh = ~0U; - - /* Use larger initial windows (RFC 4341, section 5). */ - hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); - hc->tx_expected_wnd = hc->tx_cwnd; - - /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); - if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) - dp->dccps_l_ack_ratio = max_ratio; - - /* XXX init ~ to window size... */ - if (ccid2_hc_tx_alloc_seq(hc)) - return -ENOMEM; - - hc->tx_rto = DCCP_TIMEOUT_INIT; - hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; - hc->tx_cwnd_used = 0; - hc->sk = sk; - timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); - INIT_LIST_HEAD(&hc->tx_av_chunks); - return 0; -} - -static void ccid2_hc_tx_exit(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - int i; - - sk_stop_timer(sk, &hc->tx_rtotimer); - - for (i = 0; i < hc->tx_seqbufc; i++) - kfree(hc->tx_seqbuf[i]); - hc->tx_seqbufc = 0; - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); - - if (!dccp_data_packet(skb)) - return; - - if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hc->rx_num_data_pkts = 0; - } -} - -struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, -}; - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h deleted file mode 100644 index 330c7b4ec001..000000000000 --- a/net/dccp/ccids/ccid2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005 Andrea Bittau - */ -#ifndef _DCCP_CCID2_H_ -#define _DCCP_CCID2_H_ - -#include -#include -#include "../ccid.h" -#include "../dccp.h" - -/* - * CCID-2 timestamping faces the same issues as TCP timestamping. - * Hence we reuse/share as much of the code as possible. - */ -#define ccid2_jiffies32 ((u32)jiffies) - -/* NUMDUPACK parameter from RFC 4341, p. 6 */ -#define NUMDUPACK 3 - -struct ccid2_seq { - u64 ccid2s_seq; - u32 ccid2s_sent; - int ccid2s_acked; - struct ccid2_seq *ccid2s_prev; - struct ccid2_seq *ccid2s_next; -}; - -#define CCID2_SEQBUF_LEN 1024 -#define CCID2_SEQBUF_MAX 128 - -/* - * Multiple of congestion window to keep the sequence window at - * (RFC 4340 7.5.2) - */ -#define CCID2_WIN_CHANGE_FACTOR 5 - -/** - * struct ccid2_hc_tx_sock - CCID2 TX half connection - * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_srtt: smoothed RTT estimate, scaled by 2^3 - * @tx_mdev: smoothed RTT variation, scaled by 2^2 - * @tx_mdev_max: maximum of @mdev during one flight - * @tx_rttvar: moving average/maximum of @mdev_max - * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @tx_rtt_seq: to decay RTTVAR at most once per flight - * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 - * @tx_expected_wnd: moving average of @tx_cwnd_used - * @tx_cwnd_stamp: to track idle periods in CWV - * @tx_lsndtime: last time (in jiffies) a data packet was sent - * @tx_rpseq: last consecutive seqno - * @tx_rpdupack: dupacks since rpseq - * @tx_av_chunks: list of Ack Vectors received on current skb - */ -struct ccid2_hc_tx_sock { - u32 tx_cwnd; - u32 tx_ssthresh; - u32 tx_pipe; - u32 tx_packets_acked; - struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; - int tx_seqbufc; - struct ccid2_seq *tx_seqh; - struct ccid2_seq *tx_seqt; - - /* RTT measurement: variables/principles are the same as in TCP */ - u32 tx_srtt, - tx_mdev, - tx_mdev_max, - tx_rttvar, - tx_rto; - u64 tx_rtt_seq:48; - struct timer_list tx_rtotimer; - struct sock *sk; - - /* Congestion Window validation (optional, RFC 2861) */ - u32 tx_cwnd_used, - tx_expected_wnd, - tx_cwnd_stamp, - tx_lsndtime; - - u64 tx_rpseq; - int tx_rpdupack; - u32 tx_last_cong; - u64 tx_high_ack; - struct list_head tx_av_chunks; -}; - -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) -{ - return hc->tx_pipe >= hc->tx_cwnd; -} - -/* - * Convert RFC 3390 larger initial window into an equivalent number of packets. - * This is based on the numbers specified in RFC 5681, 3.1. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 smss) -{ - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); -} - -/** - * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection - * @rx_num_data_pkts: number of data packets received since last feedback - */ -struct ccid2_hc_rx_sock { - u32 rx_num_data_pkts; -}; - -static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); -} - -static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); -} -#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c deleted file mode 100644 index f349d16dd8f6..000000000000 --- a/net/dccp/ccids/ccid3.c +++ /dev/null @@ -1,866 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "../dccp.h" -#include "ccid3.h" - -#include - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static bool ccid3_debug; -#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) -#else -#define ccid3_pr_debug(format, a...) -#endif - -/* - * Transmitter Half-Connection Routines - */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static const char *const ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hc->tx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hc->tx_state = state; -} - -/* - * Compute the initial sending rate X_init in the manner of RFC 3390: - * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT - * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. - * For consistency with other parts of the code, X_init is scaled by 2^6. - */ -static inline u64 rfc3390_initial_rate(struct sock *sk) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); - - return scaled_div(w_init << 6, hc->tx_rtt); -} - -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * @hc: socket to have the send interval updated - * - * This respects the granularity of X_inst (64 * bytes/second). - */ -static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) -{ - hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - - DCCP_BUG_ON(hc->tx_t_ipi == 0); - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_s, (unsigned int)(hc->tx_x >> 6)); -} - -static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); - - return delta / hc->tx_rtt; -} - -/** - * ccid3_hc_tx_update_x - Update allowed sending rate X - * @sk: socket to be updated - * @stamp: most recent time if available - can be left NULL. - * - * This function tracks draft rfc3448bis, check there for latest details. - * - * Note: X and X_recv are both stored in units of 64 * bytes/second, to support - * fine-grained resolution of sending rates. This requires scaling by 2^6 - * throughout the code. Only X_calc is unscaled (in bytes/second). - * - */ -static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hc->tx_x_recv; - const __u64 old_x = hc->tx_x; - ktime_t now = stamp ? *stamp : ktime_get_real(); - - /* - * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: - * a sender is idle if it has not sent anything over a 2-RTT-period. - * For consistency with X and X_recv, min_rate is also scaled by 2^6. - */ - if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { - min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hc->tx_x_recv); - } - - if (hc->tx_p > 0) { - - hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); - hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - - } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { - - hc->tx_x = min(2 * hc->tx_x, min_rate); - hc->tx_x = max(hc->tx_x, - scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); - hc->tx_t_ld = now; - } - - if (hc->tx_x != old_x) { - ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " - "X_recv=%u\n", (unsigned int)(old_x >> 6), - (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6)); - - ccid3_update_send_interval(hc); - } -} - -/** - * ccid3_hc_tx_update_s - Track the mean packet size `s' - * @hc: socket to be updated - * @len: DCCP packet payload size in bytes - * - * cf. RFC 4342, 5.3 and RFC 3448, 4.1 - */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) -{ - const u16 old_s = hc->tx_s; - - hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); - - if (hc->tx_s != old_s) - ccid3_update_send_interval(hc); -} - -/* - * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). - */ -static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, - ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), - quarter_rtts = (4 * delta) / hc->tx_rtt; - - if (quarter_rtts > 0) { - hc->tx_t_last_win_count = now; - hc->tx_last_win_count += min(quarter_rtts, 5U); - hc->tx_last_win_count &= 0xF; /* mod 16 */ - } -} - -static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) -{ - struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); - struct sock *sk = hc->sk; - unsigned long t_nfb = USEC_PER_SEC / 5; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - /* XXX: set some sensible MIB */ - goto restart_timer; - } - - ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, - ccid3_tx_state_name(hc->tx_state)); - - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - goto out; - - /* Reset feedback state to "no feedback received" */ - if (hc->tx_state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* - * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. - */ - if (hc->tx_t_rto == 0 || hc->tx_p == 0) { - - /* halve send rate directly */ - hc->tx_x = max(hc->tx_x / 2, - (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - ccid3_update_send_interval(hc); - } else { - /* - * Modify the cached value of X_recv - * - * If (X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) - hc->tx_x_recv = - max(hc->tx_x_recv / 2, - (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); - else { - hc->tx_x_recv = hc->tx_x_calc; - hc->tx_x_recv <<= 4; - } - ccid3_hc_tx_update_x(sk, NULL); - } - ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hc->tx_x); - - /* - * Set new timeout for the nofeedback timer. - * See comments in packet_recv() regarding the value of t_RTO. - */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ - t_nfb = TFRC_INITIAL_TIMEOUT; - else - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - -restart_timer: - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @sk: socket to send packet from - * @skb: next packet candidate to send on @sk - * - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. - */ -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ktime_t now = ktime_get_real(); - s64 delay; - - /* - * This function is called only for Data and DataAck packets. Sending - * zero-sized Data(Ack)s is theoretically possible, but for congestion - * control this case is pathological - ignore it. - */ - if (unlikely(skb->len == 0)) - return -EBADMSG; - - if (hc->tx_state == TFRC_SSTATE_NO_SENT) { - sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + - usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hc->tx_last_win_count = 0; - hc->tx_t_last_win_count = now; - - /* Set t_0 for initial packet */ - hc->tx_t_nom = now; - - hc->tx_s = skb->len; - - /* - * Use initial RTT sample when available: recommended by erratum - * to RFC 4342. This implements the initialisation procedure of - * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. - */ - if (dp->dccps_syn_rtt) { - ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hc->tx_rtt = dp->dccps_syn_rtt; - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - } else { - /* - * Sender does not have RTT sample: - * - set fallback RTT (RFC 4340, 3.4) since a RTT value - * is needed in several parts (e.g. window counter); - * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. - */ - hc->tx_rtt = DCCP_FALLBACK_RTT; - hc->tx_x = hc->tx_s; - hc->tx_x <<= 6; - } - ccid3_update_send_interval(hc); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - } else { - delay = ktime_us_delta(hc->tx_t_nom, now); - ccid3_pr_debug("delay=%ld\n", (long)delay); - /* - * Scheduling of packet transmissions (RFC 5348, 8.3) - * - * if (t_now > t_nom - delta) - * // send the packet now - * else - * // send the packet in (t_nom - t_now) milliseconds. - */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; - - ccid3_hc_tx_update_win_count(hc, now); - } - - /* prepare to send now (add options etc.) */ - dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; - - /* set the nominal send time for the next following packet */ - hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - ccid3_hc_tx_update_s(hc, len); - - if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) - DCCP_CRIT("packet history - out of memory!"); -} - -static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; - ktime_t now; - unsigned long t_nfb; - u32 r_sample; - - /* we are only interested in ACKs */ - if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || - DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) - return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) - return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); - - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - - /* - * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 - */ - if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - - if (hc->tx_t_rto == 0) { - /* - * Initial feedback packet: Larger Initial Windows (4.2) - */ - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - - ccid3_update_send_interval(hc); - - goto done_computing_x; - } else if (hc->tx_p == 0) { - /* - * First feedback after nofeedback timer expiry (4.3) - */ - goto done_computing_x; - } - } - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hc->tx_p > 0) - hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); - ccid3_hc_tx_update_x(sk, &now); - -done_computing_x: - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hc->tx_rtt, r_sample, - hc->tx_s, hc->tx_p, hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6), - (unsigned int)(hc->tx_x >> 6)); - - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); - - /* - * Update timeout interval for the nofeedback timer. In order to control - * rate halving on networks with very low RTTs (<= 1 ms), use per-route - * tunable RTAX_RTO_MIN value as the lower bound. - */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, - USEC_PER_SEC/HZ * tcp_rto_min(sk)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -} - -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __be32 opt_val; - - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); - - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hc->tx_x_recv = opt_val; - hc->tx_x_recv <<= 6; - - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } else { - /* Update the fixpoint Loss Event Rate fraction */ - hc->tx_p = tfrc_invert_loss_event_rate(opt_val); - - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } - } - return 0; -} - -static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); - - hc->tx_state = TFRC_SSTATE_NO_SENT; - hc->tx_hist = NULL; - hc->sk = sk; - timer_setup(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, 0); - return 0; -} - -static void ccid3_hc_tx_exit(struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); -} - -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; -} - -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) - return -EINVAL; - memset(&tfrc, 0, sizeof(tfrc)); - tfrc.tfrctx_x = hc->tx_x; - tfrc.tfrctx_x_recv = hc->tx_x_recv; - tfrc.tfrctx_x_calc = hc->tx_x_calc; - tfrc.tfrctx_rtt = hc->tx_rtt; - tfrc.tfrctx_p = hc->tx_p; - tfrc.tfrctx_rto = hc->tx_t_rto; - tfrc.tfrctx_ipi = hc->tx_t_ipi; - len = sizeof(tfrc); - val = &tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -/* - * Receiver Half-Connection Routines - */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static const char *const ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hc->rx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hc->rx_state = state; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk, - const struct sk_buff *skb, - enum ccid3_fback_type fbtype) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get(); - s64 delta = 0; - - switch (fbtype) { - case CCID3_FBACK_INITIAL: - hc->rx_x_recv = 0; - hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ - break; - case CCID3_FBACK_PARAM_CHANGE: - /* - * When parameters change (new loss or p > p_prev), we do not - * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. - */ - if (hc->rx_x_recv > 0) - break; - fallthrough; - case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); - break; - default: - return; - } - - ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, - hc->rx_x_recv, hc->rx_pinv); - - hc->rx_tstamp_last_feedback = now; - hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; - hc->rx_bytes_recv = 0; - - dp->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); -} - -static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - __be32 x_recv, pinv; - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return 0; - - if (dccp_packet_without_ack(skb)) - return 0; - - x_recv = htonl(hc->rx_x_recv); - pinv = htonl(hc->rx_pinv); - - if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)) || - dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv))) - return -1; - - return 0; -} - -/** - * ccid3_first_li - Implements [RFC 5348, 6.3.1] - * @sk: socket to calculate loss interval for - * - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p (scaled). - */ -static u32 ccid3_first_li(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p; - s64 delta; - u64 fval; - - if (hc->rx_rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hc->rx_rtt = DCCP_FALLBACK_RTT; - } - - delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - x_recv = scaled_div32(hc->rx_bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hc->rx_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hc->rx_x_recv; - } - - fval = scaled_div(hc->rx_s, hc->rx_rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); - - ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - - return p == 0 ? ~0U : scaled_div(1, p); -} - -static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; - const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - const bool is_data_packet = dccp_data_packet(skb); - - if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hc->rx_s = payload; - /* - * Not necessary to update rx_bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ - } - goto update_records; - } - - if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); - hc->rx_bytes_recv += payload; - } - - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) - return; /* done receiving */ - - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. - */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; - } - - /* - * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 - */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); -} - -static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); - - hc->rx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hc->rx_li_hist); - return tfrc_rx_hist_alloc(&hc->rx_hist); -} - -static void ccid3_hc_rx_exit(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - - tfrc_rx_hist_purge(&hc->rx_hist); - tfrc_lh_cleanup(&hc->rx_li_hist); -} - -static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; -} - -static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct tfrc_rx_info rx_info; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(rx_info)) - return -EINVAL; - rx_info.tfrcrx_x_recv = hc->rx_x_recv; - rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); - len = sizeof(rx_info); - val = &rx_info; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -struct ccid_operations ccid3_ops = { - .ccid_id = DCCPC_CCID3, - .ccid_name = "TCP-Friendly Rate Control", - .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), - .ccid_hc_tx_init = ccid3_hc_tx_init, - .ccid_hc_tx_exit = ccid3_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, - .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), - .ccid_hc_rx_init = ccid3_hc_rx_init, - .ccid_hc_rx_exit = ccid3_hc_rx_exit, - .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, - .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, - .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, - .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, - .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, - .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h deleted file mode 100644 index 02e0fc9f6334..000000000000 --- a/net/dccp/ccids/ccid3.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#ifndef _DCCP_CCID3_H_ -#define _DCCP_CCID3_H_ - -#include -#include -#include -#include -#include "lib/tfrc.h" -#include "../ccid.h" - -/* Two seconds as per RFC 5348, 4.2 */ -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) - -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 - -/* - * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#endif - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - -/** - * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket - * @tx_x: Current sending rate in 64 * bytes per second - * @tx_x_recv: Receive rate in 64 * bytes per second - * @tx_x_calc: Calculated rate in bytes per second - * @tx_rtt: Estimate of current round trip time in usecs - * @tx_p: Current loss event rate (0-1) scaled by 1000000 - * @tx_s: Packet size in bytes - * @tx_t_rto: Nofeedback Timer setting in usecs - * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @tx_state: Sender state, one of %ccid3_hc_tx_states - * @tx_last_win_count: Last window counter sent - * @tx_t_last_win_count: Timestamp of earliest packet - * with last_win_count value sent - * @tx_no_feedback_timer: Handle to no feedback timer - * @tx_t_ld: Time last doubled during slow start - * @tx_t_nom: Nominal send time of next packet - * @tx_hist: Packet history - */ -struct ccid3_hc_tx_sock { - u64 tx_x; - u64 tx_x_recv; - u32 tx_x_calc; - u32 tx_rtt; - u32 tx_p; - u32 tx_t_rto; - u32 tx_t_ipi; - u16 tx_s; - enum ccid3_hc_tx_states tx_state:8; - u8 tx_last_win_count; - ktime_t tx_t_last_win_count; - struct timer_list tx_no_feedback_timer; - struct sock *sk; - ktime_t tx_t_ld; - ktime_t tx_t_nom; - struct tfrc_tx_hist_entry *tx_hist; -}; - -static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -{ - struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); - BUG_ON(hctx == NULL); - return hctx; -} - -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, -}; - -/** - * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) - * @rx_state: Receiver state, one of %ccid3_hc_rx_states - * @rx_bytes_recv: Total sum of DCCP payload bytes - * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @rx_rtt: Receiver estimate of RTT - * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_hist: Packet history (loss detection + RTT sampling) - * @rx_li_hist: Loss Interval database - * @rx_s: Received packet size in bytes - * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - */ -struct ccid3_hc_rx_sock { - u8 rx_last_counter:4; - enum ccid3_hc_rx_states rx_state:8; - u32 rx_bytes_recv; - u32 rx_x_recv; - u32 rx_rtt; - ktime_t rx_tstamp_last_feedback; - struct tfrc_rx_hist rx_hist; - struct tfrc_loss_hist rx_li_hist; - u16 rx_s; -#define rx_pinv rx_li_hist.i_mean -}; - -static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) -{ - struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); - BUG_ON(hcrx == NULL); - return hcrx; -} - -#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c deleted file mode 100644 index da95319842bb..000000000000 --- a/net/dccp/ccids/lib/loss_interval.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -static struct kmem_cache *tfrc_lh_slab __read_mostly; -/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ -static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; - -/* implements LIFO semantics on the array */ -static inline u8 LIH_INDEX(const u8 ctr) -{ - return LIH_SIZE - 1 - (ctr % LIH_SIZE); -} - -/* the `counter' index always points at the next entry to be populated */ -static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) -{ - return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; -} - -/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ -static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) -{ - BUG_ON(i >= lh->counter); - return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; -} - -/* - * On-demand allocation and de-allocation of entries - */ -static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) -{ - if (lh->ring[LIH_INDEX(lh->counter)] == NULL) - lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, - GFP_ATOMIC); - return lh->ring[LIH_INDEX(lh->counter)]; -} - -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) -{ - if (!tfrc_lh_is_initialised(lh)) - return; - - for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) - if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { - kmem_cache_free(tfrc_lh_slab, - lh->ring[LIH_INDEX(lh->counter)]); - lh->ring[LIH_INDEX(lh->counter)] = NULL; - } -} - -static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) -{ - u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; - int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - - if (k <= 0) - return; - - for (i = 0; i <= k; i++) { - i_i = tfrc_lh_get_interval(lh, i); - - if (i < k) { - i_tot0 += i_i * tfrc_lh_weights[i]; - w_tot += tfrc_lh_weights[i]; - } - if (i > 0) - i_tot1 += i_i * tfrc_lh_weights[i-1]; - } - - lh->i_mean = max(i_tot0, i_tot1) / w_tot; -} - -/** - * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * @lh: histogram to update - * @skb: received socket triggering loss interval update - * - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev - */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; - s64 len; - - if (cur == NULL) /* not initialised */ - return 0; - - len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; - - if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; - - if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) - /* - * Implements RFC 4342, 10.2: - * If a packet S (skb) exists whose seqno comes `after' the one - * starting the current loss interval (cur) and if the modulo-16 - * distance from C(cur) to C(S) is greater than 4, consider all - * subsequent packets as belonging to a new loss interval. This - * test is necessary since CCVal may wrap between intervals. - */ - cur->li_is_closed = 1; - - if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; - - cur->li_length = len; - tfrc_lh_calc_i_mean(lh); - - return lh->i_mean < old_i_mean; -} - -/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, - struct tfrc_rx_hist_entry *new_loss) -{ - return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && - (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); -} - -/** - * tfrc_lh_interval_add - Insert new record into the Loss Interval database - * @lh: Loss Interval database - * @rh: Receive history containing a fresh loss event - * @calc_first_li: Caller-dependent routine to compute length of first interval - * @sk: Used by @calc_first_li in caller-specific way (subtyping) - * - * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. - */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - - if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; - - new = tfrc_lh_demand_next(lh); - if (unlikely(new == NULL)) { - DCCP_CRIT("Cannot allocate/add loss record."); - return 0; - } - - new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; - new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; - new->li_is_closed = 0; - - if (++lh->counter == 1) - lh->i_mean = new->li_length = (*calc_first_li)(sk); - else { - cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); - new->li_length = dccp_delta_seqno(new->li_seqno, - tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; - if (lh->counter > (2*LIH_SIZE)) - lh->counter -= LIH_SIZE; - - tfrc_lh_calc_i_mean(lh); - } - return 1; -} - -int __init tfrc_li_init(void) -{ - tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", - sizeof(struct tfrc_loss_interval), 0, - SLAB_HWCACHE_ALIGN, NULL); - return tfrc_lh_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_li_exit(void) -{ - if (tfrc_lh_slab != NULL) { - kmem_cache_destroy(tfrc_lh_slab); - tfrc_lh_slab = NULL; - } -} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h deleted file mode 100644 index c3d95f85e43b..000000000000 --- a/net/dccp/ccids/lib/loss_interval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _DCCP_LI_HIST_ -#define _DCCP_LI_HIST_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include -#include - -/* - * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than - * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. - */ -#define NINTERVAL 8 -#define LIH_SIZE (NINTERVAL + 1) - -/** - * tfrc_loss_interval - Loss history record for TFRC-based protocols - * @li_seqno: Highest received seqno before the start of loss - * @li_ccval: The CCVal belonging to @li_seqno - * @li_is_closed: Whether @li_seqno is older than 1 RTT - * @li_length: Loss interval sequence length - */ -struct tfrc_loss_interval { - u64 li_seqno:48, - li_ccval:4, - li_is_closed:1; - u32 li_length; -}; - -/** - * tfrc_loss_hist - Loss record database - * @ring: Circular queue managed in LIFO manner - * @counter: Current count of entries (can be more than %LIH_SIZE) - * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] - */ -struct tfrc_loss_hist { - struct tfrc_loss_interval *ring[LIH_SIZE]; - u8 counter; - u32 i_mean; -}; - -static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) -{ - memset(lh, 0, sizeof(struct tfrc_loss_hist)); -} - -static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) -{ - return lh->counter > 0; -} - -static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) -{ - return min(lh->counter, (u8)LIH_SIZE); -} - -struct tfrc_rx_hist; - -int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, - u32 (*first_li)(struct sock *), struct sock *); -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); - -#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c deleted file mode 100644 index 0cdda3c66fb5..000000000000 --- a/net/dccp/ccids/lib/packet_history.c +++ /dev/null @@ -1,439 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include -#include "packet_history.h" -#include "../../dccp.h" - -/* - * Transmitter History Routines - */ -static struct kmem_cache *tfrc_tx_hist_slab; - -int __init tfrc_tx_packet_history_init(void) -{ - tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", - sizeof(struct tfrc_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_tx_packet_history_exit(void) -{ - if (tfrc_tx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_tx_hist_slab); - tfrc_tx_hist_slab = NULL; - } -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) -{ - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); - - if (entry == NULL) - return -ENOBUFS; - entry->seqno = seqno; - entry->stamp = ktime_get_real(); - entry->next = *headp; - *headp = entry; - return 0; -} - -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) -{ - struct tfrc_tx_hist_entry *head = *headp; - - while (head != NULL) { - struct tfrc_tx_hist_entry *next = head->next; - - kmem_cache_free(tfrc_tx_hist_slab, head); - head = next; - } - - *headp = NULL; -} - -/* - * Receiver History Routines - */ -static struct kmem_cache *tfrc_rx_hist_slab; - -int __init tfrc_rx_packet_history_init(void) -{ - tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", - sizeof(struct tfrc_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_rx_packet_history_exit(void) -{ - if (tfrc_rx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_rx_hist_slab); - tfrc_rx_hist_slab = NULL; - } -} - -static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, - const struct sk_buff *skb, - const u64 ndp) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->tfrchrx_ccval = dh->dccph_ccval; - entry->tfrchrx_type = dh->dccph_type; - entry->tfrchrx_ndp = ndp; - entry->tfrchrx_tstamp = ktime_get_real(); -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, - const struct sk_buff *skb, - const u64 ndp) -{ - struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); - - tfrc_rx_hist_entry_from_skb(entry, skb, ndp); -} - -/* has the packet contained in skb been seen before? */ -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) -{ - const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; - int i; - - if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) - return 1; - - for (i = 1; i <= h->loss_count; i++) - if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) - return 1; - - return 0; -} - -static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - - swap(h->ring[idx_a], h->ring[idx_b]); -} - -/* - * Private helper functions for loss detection. - * - * In the descriptions, `Si' refers to the sequence number of entry number i, - * whose NDP count is `Ni' (lower case is used for variables). - * Note: All __xxx_loss functions expect that a test against duplicates has been - * performed already: the seqno of the skb must not be less than the seqno - * of loss_prev; and it must not equal that of any valid history entry. - */ -static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ - h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } -} - -static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ - h->loss_count = 2; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); - return; - } - - /* S0 < S2 < S1 */ - - if (dccp_loss_free(s0, s2, n2)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s2, s1, n1)) { - /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); - } else - /* gap between S2 and S1: just update loss_prev */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); - - } else { /* gap between S0 and S2 */ - /* - * Reorder history to insert S2 between S0 and S1 - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); - h->loss_count = 2; - } -} - -/* return 1 if a new loss event has been identified */ -static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ - h->loss_count = 3; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); - return 1; - } - - /* S3 < S2 */ - - if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ - /* - * Reorder history to insert S3 between S1 and S2 - */ - tfrc_rx_hist_swap(h, 2, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); - h->loss_count = 3; - return 1; - } - - /* S0 < S3 < S1 */ - - if (dccp_loss_free(s0, s3, n3)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s3, s1, n1)) { - /* hole between S0 and S1 filled by S3 */ - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; - } else { - /* gap remains between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 1; - } - - } else /* gap exists between S3 and S1, loss_count stays at 2 */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); - - return 0; - } - - /* - * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 - * Reorder history to insert S3 between S0 and S1. - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); - h->loss_count = 3; - - return 1; -} - -/* recycle RX history records to continue loss detection if necessary */ -static void __three_after_loss(struct tfrc_rx_hist *h) -{ - /* - * At this stage we know already that there is a gap between S0 and S1 - * (since S0 was the highest sequence number received before detecting - * the loss). To recycle the loss record, it is thus only necessary to - * check for other possible gaps between S1/S2 and between S2/S3. - */ - u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, - n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - - if (dccp_loss_free(s2, s3, n3)) { - /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; - } else { - /* gap between S2 and S3 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 1; - } - - } else { /* gap between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 2; - } -} - -/** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) - * - * Chooses action according to pending loss, updates LI database when a new - * loss was detected, and does required post-processing. Returns 1 when caller - * should send feedback, 0 otherwise. - * Since it also takes care of reordering during loss detection and updates the - * records accordingly, the caller should not perform any more RX history - * operations when loss_count is greater than 0 after calling this function. - */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - int is_new_loss = 0; - - if (h->loss_count == 0) { - __do_track_loss(h, skb, ndp); - } else if (h->loss_count == 1) { - __one_after_loss(h, skb, ndp); - } else if (h->loss_count != 2) { - DCCP_BUG("invalid loss_count %d", h->loss_count); - } else if (__two_after_loss(h, skb, ndp)) { - /* - * Update Loss Interval database and recycle RX records - */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); - __three_after_loss(h); - } - return is_new_loss; -} - -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} - -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; ++i) - if (h->ring[i] != NULL) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } -} - -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - -/** - * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * @h: receive histogram - * @skb: packet containing timestamp. - * - * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. - */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) -{ - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; - } - - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } - - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - return sample; -} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h deleted file mode 100644 index 159cc9326eab..000000000000 --- a/net/dccp/ccids/lib/packet_history.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Packet RX/TX history data structures and routines for TFRC-based protocols. - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include -#include -#include "tfrc.h" - -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); - -/* Subtraction a-b modulo-16, respects circular wrap-around */ -#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) - -/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ -#define TFRC_NDUPACK 3 - -/** - * tfrc_rx_hist_entry - Store information about a single received packet - * @tfrchrx_seqno: DCCP packet sequence number - * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) - * @tfrchrx_ndp: the NDP count (if any) of the packet - * @tfrchrx_tstamp: actual receive time of packet - */ -struct tfrc_rx_hist_entry { - u64 tfrchrx_seqno:48, - tfrchrx_ccval:4, - tfrchrx_type:4; - u64 tfrchrx_ndp:48; - ktime_t tfrchrx_tstamp; -}; - -/** - * tfrc_rx_hist - RX history structure for TFRC-based protocols - * @ring: Packet history for RTT sampling and loss detection - * @loss_count: Number of entries in circular history - * @loss_start: Movable index (for loss detection) - * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - */ -struct tfrc_rx_hist { - struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; - u8 loss_count:2, - loss_start:2; -#define rtt_sample_prev loss_start -}; - -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h->loss_start + n) & TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; -} - -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h->ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h->ring[h->loss_start]; -} - -/* indicate whether previously a packet was detected missing */ -static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h->loss_count > 0; -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, - const u64 ndp); - -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); - -struct tfrc_loss_hist; -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), struct sock *sk); -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); - -#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c deleted file mode 100644 index d7f265e1f50c..000000000000 --- a/net/dccp/ccids/lib/tfrc.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * TFRC library initialisation - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2007 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -bool tfrc_debug; -module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); -#endif - -int __init tfrc_lib_init(void) -{ - int rc = tfrc_li_init(); - - if (rc) - goto out; - - rc = tfrc_tx_packet_history_init(); - if (rc) - goto out_free_loss_intervals; - - rc = tfrc_rx_packet_history_init(); - if (rc) - goto out_free_tx_history; - return 0; - -out_free_tx_history: - tfrc_tx_packet_history_exit(); -out_free_loss_intervals: - tfrc_li_exit(); -out: - return rc; -} - -void tfrc_lib_exit(void) -{ - tfrc_rx_packet_history_exit(); - tfrc_tx_packet_history_exit(); - tfrc_li_exit(); -} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h deleted file mode 100644 index 0a63e8750cc5..000000000000 --- a/net/dccp/ccids/lib/tfrc.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _TFRC_H_ -#define _TFRC_H_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include -#include -#include "../../dccp.h" - -/* internal includes that this library exports: */ -#include "loss_interval.h" -#include "packet_history.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -extern bool tfrc_debug; -#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) -#else -#define tfrc_pr_debug(format, a...) -#endif - -/* integer-arithmetic divisions of type (a * 1000000)/b */ -static inline u64 scaled_div(u64 a, u64 b) -{ - BUG_ON(b == 0); - return div64_u64(a * 1000000, b); -} - -static inline u32 scaled_div32(u64 a, u64 b) -{ - u64 result = scaled_div(a, b); - - if (result > UINT_MAX) { - DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", - (unsigned long long)a, (unsigned long long)b); - return UINT_MAX; - } - return result; -} - -/** - * tfrc_ewma - Exponentially weighted moving average - * @weight: Weight to be used as damping factor, in units of 1/10 - */ -static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) -{ - return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval; -} - -u32 tfrc_calc_x(u16 s, u32 R, u32 p); -u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); - -int tfrc_tx_packet_history_init(void); -void tfrc_tx_packet_history_exit(void); -int tfrc_rx_packet_history_init(void); -void tfrc_rx_packet_history_exit(void); - -int tfrc_li_init(void); -void tfrc_li_exit(void); - -#ifdef CONFIG_IP_DCCP_TFRC_LIB -int tfrc_lib_init(void); -void tfrc_lib_exit(void); -#else -#define tfrc_lib_init() (0) -#define tfrc_lib_exit() -#endif -#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c deleted file mode 100644 index 92a8c6bea316..000000000000 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ /dev/null @@ -1,702 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ - -#include -#include "../../dccp.h" -#include "tfrc.h" - -#define TFRC_CALC_X_ARRSIZE 500 -#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) - -/* - TFRC TCP Reno Throughput Equation Lookup Table for f(p) - - The following two-column lookup table implements a part of the TCP throughput - equation from [RFC 3448, sec. 3.1]: - - s - X_calc = -------------------------------------------------------------- - R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) - - Where: - X is the transmit rate in bytes/second - s is the packet size in bytes - R is the round trip time in seconds - p is the loss event rate, between 0 and 1.0, of the number of loss - events as a fraction of the number of packets transmitted - t_RTO is the TCP retransmission timeout value in seconds - b is the number of packets acknowledged by a single TCP ACK - - We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: - - s - X_calc = ------------------------------------------------------- - R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) - - which we can break down into: - - s - X_calc = --------- - R * f(p) - - where f(p) is given for 0 < p <= 1 by: - - f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) - - Since this is kernel code, floating-point arithmetic is avoided in favour of - integer arithmetic. This means that nearly all fractional parameters are - scaled by 1000000: - * the parameters p and R - * the return result f(p) - The lookup table therefore actually tabulates the following function g(q): - - g(q) = 1000000 * f(q/1000000) - - Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer - granularity for the practically more relevant case of small values of p (up to - 5%), the second column is used; the first one ranges up to 100%. This split - corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also - determines the smallest resolution possible with this lookup table: - - TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE - - The entire table is generated by: - for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { - lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); - lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); - } - - With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, - lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) - lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) - lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) - - In summary, the two columns represent f(p) for the following ranges: - * The first column is for 0.002 <= p <= 1.0 - * The second column is for 0.0001 <= p <= 0.05 - Where the columns overlap, the second (finer-grained) is given preference, - i.e. the first column is used only for p >= 0.05. - */ -static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { - { 37172, 8172 }, - { 53499, 11567 }, - { 66664, 14180 }, - { 78298, 16388 }, - { 89021, 18339 }, - { 99147, 20108 }, - { 108858, 21738 }, - { 118273, 23260 }, - { 127474, 24693 }, - { 136520, 26052 }, - { 145456, 27348 }, - { 154316, 28589 }, - { 163130, 29783 }, - { 171919, 30935 }, - { 180704, 32049 }, - { 189502, 33130 }, - { 198328, 34180 }, - { 207194, 35202 }, - { 216114, 36198 }, - { 225097, 37172 }, - { 234153, 38123 }, - { 243294, 39055 }, - { 252527, 39968 }, - { 261861, 40864 }, - { 271305, 41743 }, - { 280866, 42607 }, - { 290553, 43457 }, - { 300372, 44293 }, - { 310333, 45117 }, - { 320441, 45929 }, - { 330705, 46729 }, - { 341131, 47518 }, - { 351728, 48297 }, - { 362501, 49066 }, - { 373460, 49826 }, - { 384609, 50577 }, - { 395958, 51320 }, - { 407513, 52054 }, - { 419281, 52780 }, - { 431270, 53499 }, - { 443487, 54211 }, - { 455940, 54916 }, - { 468635, 55614 }, - { 481581, 56306 }, - { 494785, 56991 }, - { 508254, 57671 }, - { 521996, 58345 }, - { 536019, 59014 }, - { 550331, 59677 }, - { 564939, 60335 }, - { 579851, 60988 }, - { 595075, 61636 }, - { 610619, 62279 }, - { 626491, 62918 }, - { 642700, 63553 }, - { 659253, 64183 }, - { 676158, 64809 }, - { 693424, 65431 }, - { 711060, 66050 }, - { 729073, 66664 }, - { 747472, 67275 }, - { 766266, 67882 }, - { 785464, 68486 }, - { 805073, 69087 }, - { 825103, 69684 }, - { 845562, 70278 }, - { 866460, 70868 }, - { 887805, 71456 }, - { 909606, 72041 }, - { 931873, 72623 }, - { 954614, 73202 }, - { 977839, 73778 }, - { 1001557, 74352 }, - { 1025777, 74923 }, - { 1050508, 75492 }, - { 1075761, 76058 }, - { 1101544, 76621 }, - { 1127867, 77183 }, - { 1154739, 77741 }, - { 1182172, 78298 }, - { 1210173, 78852 }, - { 1238753, 79405 }, - { 1267922, 79955 }, - { 1297689, 80503 }, - { 1328066, 81049 }, - { 1359060, 81593 }, - { 1390684, 82135 }, - { 1422947, 82675 }, - { 1455859, 83213 }, - { 1489430, 83750 }, - { 1523671, 84284 }, - { 1558593, 84817 }, - { 1594205, 85348 }, - { 1630518, 85878 }, - { 1667543, 86406 }, - { 1705290, 86932 }, - { 1743770, 87457 }, - { 1782994, 87980 }, - { 1822973, 88501 }, - { 1863717, 89021 }, - { 1905237, 89540 }, - { 1947545, 90057 }, - { 1990650, 90573 }, - { 2034566, 91087 }, - { 2079301, 91600 }, - { 2124869, 92111 }, - { 2171279, 92622 }, - { 2218543, 93131 }, - { 2266673, 93639 }, - { 2315680, 94145 }, - { 2365575, 94650 }, - { 2416371, 95154 }, - { 2468077, 95657 }, - { 2520707, 96159 }, - { 2574271, 96660 }, - { 2628782, 97159 }, - { 2684250, 97658 }, - { 2740689, 98155 }, - { 2798110, 98651 }, - { 2856524, 99147 }, - { 2915944, 99641 }, - { 2976382, 100134 }, - { 3037850, 100626 }, - { 3100360, 101117 }, - { 3163924, 101608 }, - { 3228554, 102097 }, - { 3294263, 102586 }, - { 3361063, 103073 }, - { 3428966, 103560 }, - { 3497984, 104045 }, - { 3568131, 104530 }, - { 3639419, 105014 }, - { 3711860, 105498 }, - { 3785467, 105980 }, - { 3860253, 106462 }, - { 3936229, 106942 }, - { 4013410, 107422 }, - { 4091808, 107902 }, - { 4171435, 108380 }, - { 4252306, 108858 }, - { 4334431, 109335 }, - { 4417825, 109811 }, - { 4502501, 110287 }, - { 4588472, 110762 }, - { 4675750, 111236 }, - { 4764349, 111709 }, - { 4854283, 112182 }, - { 4945564, 112654 }, - { 5038206, 113126 }, - { 5132223, 113597 }, - { 5227627, 114067 }, - { 5324432, 114537 }, - { 5422652, 115006 }, - { 5522299, 115474 }, - { 5623389, 115942 }, - { 5725934, 116409 }, - { 5829948, 116876 }, - { 5935446, 117342 }, - { 6042439, 117808 }, - { 6150943, 118273 }, - { 6260972, 118738 }, - { 6372538, 119202 }, - { 6485657, 119665 }, - { 6600342, 120128 }, - { 6716607, 120591 }, - { 6834467, 121053 }, - { 6953935, 121514 }, - { 7075025, 121976 }, - { 7197752, 122436 }, - { 7322131, 122896 }, - { 7448175, 123356 }, - { 7575898, 123815 }, - { 7705316, 124274 }, - { 7836442, 124733 }, - { 7969291, 125191 }, - { 8103877, 125648 }, - { 8240216, 126105 }, - { 8378321, 126562 }, - { 8518208, 127018 }, - { 8659890, 127474 }, - { 8803384, 127930 }, - { 8948702, 128385 }, - { 9095861, 128840 }, - { 9244875, 129294 }, - { 9395760, 129748 }, - { 9548529, 130202 }, - { 9703198, 130655 }, - { 9859782, 131108 }, - { 10018296, 131561 }, - { 10178755, 132014 }, - { 10341174, 132466 }, - { 10505569, 132917 }, - { 10671954, 133369 }, - { 10840345, 133820 }, - { 11010757, 134271 }, - { 11183206, 134721 }, - { 11357706, 135171 }, - { 11534274, 135621 }, - { 11712924, 136071 }, - { 11893673, 136520 }, - { 12076536, 136969 }, - { 12261527, 137418 }, - { 12448664, 137867 }, - { 12637961, 138315 }, - { 12829435, 138763 }, - { 13023101, 139211 }, - { 13218974, 139658 }, - { 13417071, 140106 }, - { 13617407, 140553 }, - { 13819999, 140999 }, - { 14024862, 141446 }, - { 14232012, 141892 }, - { 14441465, 142339 }, - { 14653238, 142785 }, - { 14867346, 143230 }, - { 15083805, 143676 }, - { 15302632, 144121 }, - { 15523842, 144566 }, - { 15747453, 145011 }, - { 15973479, 145456 }, - { 16201939, 145900 }, - { 16432847, 146345 }, - { 16666221, 146789 }, - { 16902076, 147233 }, - { 17140429, 147677 }, - { 17381297, 148121 }, - { 17624696, 148564 }, - { 17870643, 149007 }, - { 18119154, 149451 }, - { 18370247, 149894 }, - { 18623936, 150336 }, - { 18880241, 150779 }, - { 19139176, 151222 }, - { 19400759, 151664 }, - { 19665007, 152107 }, - { 19931936, 152549 }, - { 20201564, 152991 }, - { 20473907, 153433 }, - { 20748982, 153875 }, - { 21026807, 154316 }, - { 21307399, 154758 }, - { 21590773, 155199 }, - { 21876949, 155641 }, - { 22165941, 156082 }, - { 22457769, 156523 }, - { 22752449, 156964 }, - { 23049999, 157405 }, - { 23350435, 157846 }, - { 23653774, 158287 }, - { 23960036, 158727 }, - { 24269236, 159168 }, - { 24581392, 159608 }, - { 24896521, 160049 }, - { 25214642, 160489 }, - { 25535772, 160929 }, - { 25859927, 161370 }, - { 26187127, 161810 }, - { 26517388, 162250 }, - { 26850728, 162690 }, - { 27187165, 163130 }, - { 27526716, 163569 }, - { 27869400, 164009 }, - { 28215234, 164449 }, - { 28564236, 164889 }, - { 28916423, 165328 }, - { 29271815, 165768 }, - { 29630428, 166208 }, - { 29992281, 166647 }, - { 30357392, 167087 }, - { 30725779, 167526 }, - { 31097459, 167965 }, - { 31472452, 168405 }, - { 31850774, 168844 }, - { 32232445, 169283 }, - { 32617482, 169723 }, - { 33005904, 170162 }, - { 33397730, 170601 }, - { 33792976, 171041 }, - { 34191663, 171480 }, - { 34593807, 171919 }, - { 34999428, 172358 }, - { 35408544, 172797 }, - { 35821174, 173237 }, - { 36237335, 173676 }, - { 36657047, 174115 }, - { 37080329, 174554 }, - { 37507197, 174993 }, - { 37937673, 175433 }, - { 38371773, 175872 }, - { 38809517, 176311 }, - { 39250924, 176750 }, - { 39696012, 177190 }, - { 40144800, 177629 }, - { 40597308, 178068 }, - { 41053553, 178507 }, - { 41513554, 178947 }, - { 41977332, 179386 }, - { 42444904, 179825 }, - { 42916290, 180265 }, - { 43391509, 180704 }, - { 43870579, 181144 }, - { 44353520, 181583 }, - { 44840352, 182023 }, - { 45331092, 182462 }, - { 45825761, 182902 }, - { 46324378, 183342 }, - { 46826961, 183781 }, - { 47333531, 184221 }, - { 47844106, 184661 }, - { 48358706, 185101 }, - { 48877350, 185541 }, - { 49400058, 185981 }, - { 49926849, 186421 }, - { 50457743, 186861 }, - { 50992759, 187301 }, - { 51531916, 187741 }, - { 52075235, 188181 }, - { 52622735, 188622 }, - { 53174435, 189062 }, - { 53730355, 189502 }, - { 54290515, 189943 }, - { 54854935, 190383 }, - { 55423634, 190824 }, - { 55996633, 191265 }, - { 56573950, 191706 }, - { 57155606, 192146 }, - { 57741621, 192587 }, - { 58332014, 193028 }, - { 58926806, 193470 }, - { 59526017, 193911 }, - { 60129666, 194352 }, - { 60737774, 194793 }, - { 61350361, 195235 }, - { 61967446, 195677 }, - { 62589050, 196118 }, - { 63215194, 196560 }, - { 63845897, 197002 }, - { 64481179, 197444 }, - { 65121061, 197886 }, - { 65765563, 198328 }, - { 66414705, 198770 }, - { 67068508, 199213 }, - { 67726992, 199655 }, - { 68390177, 200098 }, - { 69058085, 200540 }, - { 69730735, 200983 }, - { 70408147, 201426 }, - { 71090343, 201869 }, - { 71777343, 202312 }, - { 72469168, 202755 }, - { 73165837, 203199 }, - { 73867373, 203642 }, - { 74573795, 204086 }, - { 75285124, 204529 }, - { 76001380, 204973 }, - { 76722586, 205417 }, - { 77448761, 205861 }, - { 78179926, 206306 }, - { 78916102, 206750 }, - { 79657310, 207194 }, - { 80403571, 207639 }, - { 81154906, 208084 }, - { 81911335, 208529 }, - { 82672880, 208974 }, - { 83439562, 209419 }, - { 84211402, 209864 }, - { 84988421, 210309 }, - { 85770640, 210755 }, - { 86558080, 211201 }, - { 87350762, 211647 }, - { 88148708, 212093 }, - { 88951938, 212539 }, - { 89760475, 212985 }, - { 90574339, 213432 }, - { 91393551, 213878 }, - { 92218133, 214325 }, - { 93048107, 214772 }, - { 93883493, 215219 }, - { 94724314, 215666 }, - { 95570590, 216114 }, - { 96422343, 216561 }, - { 97279594, 217009 }, - { 98142366, 217457 }, - { 99010679, 217905 }, - { 99884556, 218353 }, - { 100764018, 218801 }, - { 101649086, 219250 }, - { 102539782, 219698 }, - { 103436128, 220147 }, - { 104338146, 220596 }, - { 105245857, 221046 }, - { 106159284, 221495 }, - { 107078448, 221945 }, - { 108003370, 222394 }, - { 108934074, 222844 }, - { 109870580, 223294 }, - { 110812910, 223745 }, - { 111761087, 224195 }, - { 112715133, 224646 }, - { 113675069, 225097 }, - { 114640918, 225548 }, - { 115612702, 225999 }, - { 116590442, 226450 }, - { 117574162, 226902 }, - { 118563882, 227353 }, - { 119559626, 227805 }, - { 120561415, 228258 }, - { 121569272, 228710 }, - { 122583219, 229162 }, - { 123603278, 229615 }, - { 124629471, 230068 }, - { 125661822, 230521 }, - { 126700352, 230974 }, - { 127745083, 231428 }, - { 128796039, 231882 }, - { 129853241, 232336 }, - { 130916713, 232790 }, - { 131986475, 233244 }, - { 133062553, 233699 }, - { 134144966, 234153 }, - { 135233739, 234608 }, - { 136328894, 235064 }, - { 137430453, 235519 }, - { 138538440, 235975 }, - { 139652876, 236430 }, - { 140773786, 236886 }, - { 141901190, 237343 }, - { 143035113, 237799 }, - { 144175576, 238256 }, - { 145322604, 238713 }, - { 146476218, 239170 }, - { 147636442, 239627 }, - { 148803298, 240085 }, - { 149976809, 240542 }, - { 151156999, 241000 }, - { 152343890, 241459 }, - { 153537506, 241917 }, - { 154737869, 242376 }, - { 155945002, 242835 }, - { 157158929, 243294 }, - { 158379673, 243753 }, - { 159607257, 244213 }, - { 160841704, 244673 }, - { 162083037, 245133 }, - { 163331279, 245593 }, - { 164586455, 246054 }, - { 165848586, 246514 }, - { 167117696, 246975 }, - { 168393810, 247437 }, - { 169676949, 247898 }, - { 170967138, 248360 }, - { 172264399, 248822 }, - { 173568757, 249284 }, - { 174880235, 249747 }, - { 176198856, 250209 }, - { 177524643, 250672 }, - { 178857621, 251136 }, - { 180197813, 251599 }, - { 181545242, 252063 }, - { 182899933, 252527 }, - { 184261908, 252991 }, - { 185631191, 253456 }, - { 187007807, 253920 }, - { 188391778, 254385 }, - { 189783129, 254851 }, - { 191181884, 255316 }, - { 192588065, 255782 }, - { 194001698, 256248 }, - { 195422805, 256714 }, - { 196851411, 257181 }, - { 198287540, 257648 }, - { 199731215, 258115 }, - { 201182461, 258582 }, - { 202641302, 259050 }, - { 204107760, 259518 }, - { 205581862, 259986 }, - { 207063630, 260454 }, - { 208553088, 260923 }, - { 210050262, 261392 }, - { 211555174, 261861 }, - { 213067849, 262331 }, - { 214588312, 262800 }, - { 216116586, 263270 }, - { 217652696, 263741 }, - { 219196666, 264211 }, - { 220748520, 264682 }, - { 222308282, 265153 }, - { 223875978, 265625 }, - { 225451630, 266097 }, - { 227035265, 266569 }, - { 228626905, 267041 }, - { 230226576, 267514 }, - { 231834302, 267986 }, - { 233450107, 268460 }, - { 235074016, 268933 }, - { 236706054, 269407 }, - { 238346244, 269881 }, - { 239994613, 270355 }, - { 241651183, 270830 }, - { 243315981, 271305 } -}; - -/* return largest index i such that fval <= lookup[i][small] */ -static inline u32 tfrc_binsearch(u32 fval, u8 small) -{ - u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; - - while (low < high) { - try = (low + high) / 2; - if (fval <= tfrc_calc_x_lookup[try][small]) - high = try; - else - low = try + 1; - } - return high; -} - -/** - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 - * @s: packet size in bytes - * @R: RTT scaled by 1000000 (i.e., microseconds) - * @p: loss ratio estimate scaled by 1000000 - * - * Returns X_calc in bytes per second (not scaled). - */ -u32 tfrc_calc_x(u16 s, u32 R, u32 p) -{ - u16 index; - u32 f; - u64 result; - - /* check against invalid parameters and divide-by-zero */ - BUG_ON(p > 1000000); /* p must not exceed 100% */ - BUG_ON(p == 0); /* f(0) = 0, divide by zero */ - if (R == 0) { /* possible divide by zero */ - DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); - return ~0U; - } - - if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ - if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); - index = 0; - } else /* 0.0001 <= p <= 0.05 */ - index = p/TFRC_SMALLEST_P - 1; - - f = tfrc_calc_x_lookup[index][1]; - - } else { /* 0.05 < p <= 1.00 */ - index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - - f = tfrc_calc_x_lookup[index][0]; - } - - /* - * Compute X = s/(R*f(p)) in bytes per second. - * Since f(p) and R are both scaled by 1000000, we need to multiply by - * 1000000^2. To avoid overflow, the result is computed in two stages. - * This works under almost all reasonable operational conditions, for a - * wide range of parameters. Yet, should some strange combination of - * parameters result in overflow, the use of scaled_div32 will catch - * this and return UINT_MAX - which is a logically adequate consequence. - */ - result = scaled_div(s, R); - return scaled_div32(result, f); -} - -/** - * tfrc_calc_x_reverse_lookup - try to find p given f(p) - * @fvalue: function value to match, scaled by 1000000 - * - * Returns closest match for p, also scaled by 1000000 - */ -u32 tfrc_calc_x_reverse_lookup(u32 fvalue) -{ - int index; - - if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ - return 0; - - /* Error cases. */ - if (fvalue < tfrc_calc_x_lookup[0][1]) { - DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); - return TFRC_SMALLEST_P; - } - if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { - DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); - return 1000000; - } - - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { - index = tfrc_binsearch(fvalue, 1); - return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - } - - /* else ... it must be in the coarse-grained column */ - index = tfrc_binsearch(fvalue, 0); - return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; -} - -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * @loss_event_rate: loss event rate to invert - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h deleted file mode 100644 index 1f748ed1279d..000000000000 --- a/net/dccp/dccp.h +++ /dev/null @@ -1,483 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_H -#define _DCCP_H -/* - * net/dccp/dccp.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005-6 Ian McDonald - */ - -#include -#include -#include -#include -#include -#include "ackvec.h" - -/* - * DCCP - specific warning and debugging macros. - */ -#define DCCP_WARN(fmt, ...) \ - net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) -#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ - __FILE__, __LINE__, __func__) -#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) -#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ - DCCP_BUG("\"%s\" holds (exception!)", \ - __stringify(cond)); \ - } while (0) - -#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ - printk(fmt, ##args); \ - } while(0) -#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ - "%s: " fmt, __func__, ##a) - -#ifdef CONFIG_IP_DCCP_DEBUG -extern bool dccp_debug; -#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) -#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) -#else -#define dccp_pr_debug(format, a...) do {} while (0) -#define dccp_pr_debug_cat(format, a...) do {} while (0) -#define dccp_debug(format, a...) do {} while (0) -#endif - -extern struct inet_hashinfo dccp_hashinfo; - -DECLARE_PER_CPU(unsigned int, dccp_orphan_count); - -void dccp_time_wait(struct sock *sk, int state, int timeo); - -/* - * Set safe upper bounds for header and option length. Since Data Offset is 8 - * bits (RFC 4340, sec. 5.1), the total header length can never be more than - * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): - * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR - * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields - * Hence a safe upper bound for the maximum option length is 1020-28 = 992 - */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) -#define DCCP_MAX_PACKET_HDR 28 -#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) -#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) - -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - -#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT - * state, about 60 seconds */ - -/* RFC 1122, 4.2.3.1 initial RTO value */ -#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) - -/* - * The maximum back-off value for retransmissions. This is needed for - * - retransmitting client-Requests (sec. 8.1.1), - * - retransmitting Close/CloseReq when closing (sec. 8.3), - * - feature-negotiation retransmission (sec. 6.6.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5). - */ -#define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) - -/* - * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 - */ -#define DCCP_SANE_RTT_MIN 100 -#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) -#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) - -/* sysctl variables for DCCP */ -extern int sysctl_dccp_request_retries; -extern int sysctl_dccp_retries1; -extern int sysctl_dccp_retries2; -extern int sysctl_dccp_tx_qlen; -extern int sysctl_dccp_sync_ratelimit; - -/* - * 48-bit sequence number arithmetic (signed and unsigned) - */ -#define INT48_MIN 0x800000000000LL /* 2^47 */ -#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ -#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ -#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) -#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) -#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) -#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) - -static inline void dccp_inc_seqno(u64 *seqno) -{ - *seqno = ADD48(*seqno, 1); -} - -/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ -static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) -{ - u64 delta = SUB48(seqno2, seqno1); - - return TO_SIGNED48(delta); -} - -/* is seq1 < seq2 ? */ -static inline int before48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) > 0; -} - -/* is seq1 > seq2 ? */ -#define after48(seq1, seq2) before48(seq2, seq1) - -/* is seq2 <= seq1 <= seq3 ? */ -static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) -{ - return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); -} - -/** - * dccp_loss_count - Approximate the number of lost data packets in a burst loss - * @s1: last known sequence number before the loss ('hole') - * @s2: first sequence number seen after the 'hole' - * @ndp: NDP count on packet with sequence number @s2 - */ -static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) -{ - s64 delta = dccp_delta_seqno(s1, s2); - - WARN_ON(delta < 0); - delta -= ndp + 1; - - return delta > 0 ? delta : 0; -} - -/** - * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 - */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) -{ - return dccp_loss_count(s1, s2, ndp) == 0; -} - -enum { - DCCP_MIB_NUM = 0, - DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ - DCCP_MIB_ESTABRESETS, /* EstabResets */ - DCCP_MIB_CURRESTAB, /* CurrEstab */ - DCCP_MIB_OUTSEGS, /* OutSegs */ - DCCP_MIB_OUTRSTS, - DCCP_MIB_ABORTONTIMEOUT, - DCCP_MIB_TIMEOUTS, - DCCP_MIB_ABORTFAILED, - DCCP_MIB_PASSIVEOPENS, - DCCP_MIB_ATTEMPTFAILS, - DCCP_MIB_OUTDATAGRAMS, - DCCP_MIB_INERRS, - DCCP_MIB_OPTMANDATORYERROR, - DCCP_MIB_INVALIDOPT, - __DCCP_MIB_MAX -}; - -#define DCCP_MIB_MAX __DCCP_MIB_MAX -struct dccp_mib { - unsigned long mibs[DCCP_MIB_MAX]; -}; - -DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) - -/* - * Checksumming routines - */ -static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) -{ - const struct dccp_hdr* dh = dccp_hdr(skb); - - if (dh->dccph_cscov == 0) - return skb->len; - return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); -} - -static inline void dccp_csum_outgoing(struct sk_buff *skb) -{ - unsigned int cov = dccp_csum_coverage(skb); - - if (cov >= skb->len) - dccp_hdr(skb)->dccph_cscov = 0; - - skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); - -int dccp_retransmit_skb(struct sock *sk); - -void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk); - -void dccp_send_sync(struct sock *sk, const u64 seq, - const enum dccp_pkt_type pkt_type); - -/* - * TX Packet Dequeueing Interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -bool dccp_qpolicy_full(struct sock *sk); -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -struct sk_buff *dccp_qpolicy_top(struct sock *sk); -struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -void dccp_write_xmit(struct sock *sk); -void dccp_write_space(struct sock *sk); -void dccp_flush_write_queue(struct sock *sk, long *time_budget); - -void dccp_init_xmit_timers(struct sock *sk); -static inline void dccp_clear_xmit_timers(struct sock *sk) -{ - inet_csk_clear_xmit_timers(sk); -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); - -const char *dccp_packet_name(const int type); - -void dccp_set_state(struct sock *sk, const int state); -void dccp_done(struct sock *sk); - -int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req); -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len); -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len); - -void dccp_destruct_common(struct sock *sk); -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); -void dccp_destroy_sock(struct sock *sk); - -void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req); - -int dccp_connect(struct sock *sk); -int dccp_disconnect(struct sock *sk, int flags); -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, int *karg); -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); -void dccp_shutdown(struct sock *sk, int how); -int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -void dccp_req_err(struct sock *sk, u64 seq); - -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); -void dccp_send_close(struct sock *sk, const int active); -int dccp_invalid_packet(struct sk_buff *skb); -u32 dccp_sample_rtt(struct sock *sk, long delta); - -static inline bool dccp_bad_service_code(const struct sock *sk, - const __be32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return false; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - -/** - * dccp_skb_cb - DCCP per-packet control information - * @dccpd_type: one of %dccp_pkt_type (or unknown) - * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 - * @dccpd_reset_code: one of %dccp_reset_codes - * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) - * @dccpd_opt_len: total length of all options (5.8) in the packet - * @dccpd_seq: sequence number - * @dccpd_ack_seq: acknowledgment number subheader field value - * - * This is used for transmission as well as for reception. - */ -struct dccp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - __u8 dccpd_type:4; - __u8 dccpd_ccval:4; - __u8 dccpd_reset_code, - dccpd_reset_data[3]; - __u16 dccpd_opt_len; - __u64 dccpd_seq; - __u64 dccpd_ack_seq; -}; - -#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_non_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_ACK || - type == DCCP_PKT_CLOSE || - type == DCCP_PKT_CLOSEREQ || - type == DCCP_PKT_RESET || - type == DCCP_PKT_SYNC || - type == DCCP_PKT_SYNCACK; -} - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || - type == DCCP_PKT_DATAACK || - type == DCCP_PKT_REQUEST || - type == DCCP_PKT_RESPONSE; -} - -static inline int dccp_packet_without_ack(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; -} - -#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) - -static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) -{ - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + - sizeof(*dh)); - dh->dccph_seq2 = 0; - dh->dccph_seq = htons((gss >> 32) & 0xfffff); - dhx->dccph_seq_low = htonl(gss & 0xffffffff); -} - -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, - const u64 gsr) -{ - dhack->dccph_reserved1 = 0; - dhack->dccph_ack_nr_high = htons(gsr >> 32); - dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); -} - -static inline void dccp_update_gsr(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (after48(seq, dp->dccps_gsr)) - dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). - * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); -} - -static inline void dccp_update_gss(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); -} - -static inline int dccp_ack_pending(const struct sock *sk) -{ - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); -} - -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -int dccp_feat_finalise_settings(struct dccp_sock *dp); -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -void dccp_feat_list_purge(struct list_head *fn_list); - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb); -int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -u32 dccp_timestamp(void); -void dccp_timestamping_init(void); -int dccp_insert_option(struct sk_buff *skb, unsigned char option, - const void *value, unsigned char len); - -#ifdef CONFIG_SYSCTL -int dccp_sysctl_init(void); -void dccp_sysctl_exit(void); -#else -static inline int dccp_sysctl_init(void) -{ - return 0; -} - -static inline void dccp_sysctl_exit(void) -{ -} -#endif - -#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c deleted file mode 100644 index f5019d95c3ae..000000000000 --- a/net/dccp/diag.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/diag.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - - -#include -#include - -#include "ccid.h" -#include "dccp.h" - -static void dccp_get_info(struct sock *sk, struct tcp_info *info) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - memset(info, 0, sizeof(*info)); - - info->tcpi_state = sk->sk_state; - info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = icsk->icsk_probes_out; - info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - - if (dp->dccps_hc_rx_ackvec != NULL) - info->tcpi_options |= TCPI_OPT_SACK; - - if (dp->dccps_hc_rx_ccid != NULL) - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - - if (dp->dccps_hc_tx_ccid != NULL) - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); -} - -static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - r->idiag_rqueue = r->idiag_wqueue = 0; - - if (_info != NULL) - dccp_get_info(sk, _info); -} - -static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); -} - -static int dccp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); -} - -static const struct inet_diag_handler dccp_diag_handler = { - .owner = THIS_MODULE, - .dump = dccp_diag_dump, - .dump_one = dccp_diag_dump_one, - .idiag_get_info = dccp_diag_get_info, - .idiag_type = IPPROTO_DCCP, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static int __init dccp_diag_init(void) -{ - return inet_diag_register(&dccp_diag_handler); -} - -static void __exit dccp_diag_fini(void) -{ - inet_diag_unregister(&dccp_diag_handler); -} - -module_init(dccp_diag_init); -module_exit(dccp_diag_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP inet_diag handler"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */); diff --git a/net/dccp/feat.c b/net/dccp/feat.c deleted file mode 100644 index f7554dcdaaba..000000000000 --- a/net/dccp/feat.c +++ /dev/null @@ -1,1581 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/feat.c - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * - * ASSUMPTIONS - * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. - * o All currently known SP features have 1-byte quantities. If in the future - * extensions of RFCs 4340..42 define features with item lengths larger than - * one byte, a feature-specific extension of the code will be required. - */ -#include -#include -#include "ccid.h" -#include "feat.h" - -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; - -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. - */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx); - - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - } - return 0; -} - -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} - -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; - return 0; -} - -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - } - return 0; -} - -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); - return 0; -} - -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); - } - return 0; -} - -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. | Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * @feat_num: feature to hash, one of %dccp_feature_numbers - * - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) -{ - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; - - /* - * Other features: add cases for new feature types here after adding - * them to the above table. - */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} - -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - /* - * There are no default values for unknown features, so encountering a - * negative index here indicates a serious problem somewhere else. - */ - DCCP_BUG_ON(idx < 0); - - return idx < 0 ? 0 : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *const feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *const dccp_feat_sname[] = { - "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", -}; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; - } - return NULL; -} - -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} - -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} - -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif - -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; - - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; - - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; - } - } else { - val = fval->nn; - } - - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? "" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} - -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOMEM; - } - } - return 0; -} - -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); -} - -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) -{ - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; - - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; - - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} - -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); - } -} - -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, fn_list, node) { - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - } - return NULL; -} - -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * - * This is the only constructor and serves to ensure the above invariants. - */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } - - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); - } - return entry; -} - -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return -ENOMEM; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = false; - new->empty_confirm = false; - new->val = *fval; - new->needs_mandatory = mandatory; - - return 0; -} - -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = true; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = false; - - return 0; -} - -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) -{ - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} - -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); - } - return 0; - -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} - -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * @feat_num: feature to return length of, one of %dccp_feature_numbers - * - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} - -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ -} - -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) -{ - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} - -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} - -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; - } - } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - - if (skb->sk->sk_state == DCCP_OPEN && - (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { - /* - * Confirms don't get retransmitted (6.6.3) once the - * connection is in state OPEN - */ - dccp_feat_list_pop(pos); - } else { - /* - * Enter CHANGING after transmitting the Change - * option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; - } - } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * - * Note that NN features are local by definition (RFC 4340, 6.3.2). - */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; - - if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { - kfree(fval.sp.vec); - return -ENOMEM; - } - - return 0; -} - -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/** - * dccp_feat_nn_get - Query current/pending value of NN feature - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * - * For a known NN feature, returns value currently being negotiated, or - * current (confirmed) value if no negotiation is going on. - */ -u64 dccp_feat_nn_get(struct sock *sk, u8 feat) -{ - if (dccp_feat_type(feat) == FEAT_NN) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *entry; - - entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); - if (entry != NULL) - return entry->val.nn; - - switch (feat) { - case DCCPF_ACK_RATIO: - return dp->dccps_l_ack_ratio; - case DCCPF_SEQUENCE_WINDOW: - return dp->dccps_l_seq_win; - } - } - DCCP_BUG("attempt to look up unsupported feature %u", feat); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_feat_nn_get); - -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * - * This function is used to communicate NN updates out-of-band. - */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; - - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - if (nn_val == dccp_feat_nn_get(sk, feat)) - return 0; /* already set or negotiation under way */ - - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", - (unsigned long long)entry->val.nn, - (unsigned long long)nn_val); - dccp_feat_list_pop(entry); - } - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. - */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. - */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; - } -} - -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; - - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; -} - -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * @dreq: server socket to resolve - * - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. - */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} - -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) -{ - u8 c, s; - - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} - -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * @preferred_value: entry to move to start of array - * @array: array of preferred entries - * @array_len: size of the array - * - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; - - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; - } - return does_occur; -} - -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fv: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; - - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } - - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; - - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} - -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. - */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; - - return dccp_feat_push_confirm(fn, feat, local, &fval); - } - - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) - */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - if (dccp_feat_push_confirm(fn, feat, local, &fval)) { - kfree(fval.sp.vec); - return DCCP_RESET_CODE_TOO_BUSY; - } - - return 0; - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; - } - - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = false; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = true; - } - entry->needs_confirm = true; - entry->needs_mandatory = false; - entry->state = FEAT_STABLE; - return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } - - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; - - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; - - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } - - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; - } - - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; - } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. - * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). - */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; - - dccp_feat_print_opt(opt, feat, val, len, mandatory); - - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; - - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; - - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - /* - * Just ignore a value that doesn't match our current value. - * If the option changes twice within two RTTs, then at least - * one CONFIRM will be received for the old value after a - * new CHANGE was sent. - */ - if (fval.nn != entry->val.nn) - return 0; - - /* Only activate after receiving the Confirm option (6.6.1). */ - dccp_feat_activate(sk, feat, local, &fval); - - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); - - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; - } - return 0; - -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; - - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; - fallthrough; - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection. - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); - } - return 0; /* ignore FN options in all other states */ -} - -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * @sk: Socket to initialize. - * - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; - int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; - - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); - if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len)) - return -ENOBUFS; - if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { - kfree(tx.val); - return -ENOBUFS; - } - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); - if (rc) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); - return rc; -} - -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. - */ - if (cur->empty_confirm) - continue; - - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; - } - - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } - - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); - - dccp_pr_debug("Activation OK\n"); - return 0; - -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; -} diff --git a/net/dccp/feat.h b/net/dccp/feat.h deleted file mode 100644 index 57d9c026aa3f..000000000000 --- a/net/dccp/feat.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_FEAT_H -#define _DCCP_FEAT_H -/* - * net/dccp/feat.h - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker - * Copyright (c) 2005 Andrea Bittau - */ -#include -#include "dccp.h" - -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; - -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @feat_num: one of %dccp_feature_numbers - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - dccp_feat_val val; - enum dccp_feat_state state:8; - u8 feat_num; - - bool needs_mandatory, - needs_confirm, - empty_confirm, - is_local; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) -{ - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; -} - -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -int dccp_feat_init(struct sock *sk); -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. - */ -#define DCCP_OPTVAL_MAXLEN 6 - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -u64 dccp_decode_value_var(const u8 *bf, const u8 len); -u64 dccp_feat_nn_get(struct sock *sk, u8 feat); - -int dccp_insert_option_mandatory(struct sk_buff *skb); -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, - bool repeat_first); -#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c deleted file mode 100644 index 2cbb757a894f..000000000000 --- a/net/dccp/input.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/input.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ -int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; - -static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) -{ - __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk); -} - -static void dccp_fin(struct sock *sk, struct sk_buff *skb) -{ - /* - * On receiving Close/CloseReq, both RD/WR shutdown are performed. - * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after - * receiving the closing segment, but there is no guarantee that such - * data will be processed at all. - */ - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DONE); - dccp_enqueue_skb(sk, skb); -} - -static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - switch (sk->sk_state) { - /* - * We ignore Close when received in one of the following states: - * - CLOSED (may be a late or duplicate packet) - * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) - * - RESPOND (already handled by dccp_check_req) - */ - case DCCP_CLOSING: - /* - * Simultaneous-close: receiving a Close after sending one. This - * can happen if both client and server perform active-close and - * will result in an endless ping-pong of crossing and retrans- - * mitted Close packets, which only terminates when one of the - * nodes times out (min. 64 seconds). Quicker convergence can be - * achieved when one of the nodes acts as tie-breaker. - * This is ok as both ends are done with data transfer and each - * end is just waiting for the other to acknowledge termination. - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) - break; - fallthrough; - case DCCP_REQUESTING: - case DCCP_ACTIVE_CLOSEREQ: - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_done(sk); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSE); - fallthrough; - case DCCP_PASSIVE_CLOSE: - /* - * Retransmitted Close: we have already enqueued the first one. - */ - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return queued; - } - - /* Step 13: process relevant Client states < CLOSEREQ */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - dccp_send_close(sk, 0); - dccp_set_state(sk, DCCP_CLOSING); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); - fallthrough; - case DCCP_PASSIVE_CLOSEREQ: - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static u16 dccp_reset_code_convert(const u8 code) -{ - static const u16 error_code[] = { - [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ - [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ - [DCCP_RESET_CODE_ABORTED] = ECONNRESET, - - [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, - [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, - [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, - [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, - - [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, - [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, - [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, - [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, - [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, - }; - - return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; -} - -static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) -{ - u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); - - sk->sk_err = err; - - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ - dccp_fin(sk, skb); - - if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - dccp_time_wait(sk, DCCP_TIME_WAIT, 0); -} - -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); -} - -static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - /* Don't deliver to RX CCID when node has shut down read end. */ - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - /* - * Until the TX queue has been drained, we can not honour SHUT_WR, since - * we need received feedback as input to adjust congestion control. - */ - if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); -} - -static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - struct dccp_sock *dp = dccp_sk(sk); - u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - - /* - * Step 5: Prepare sequence numbers for Sync - * If P.type == Sync or P.type == SyncAck, - * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, - * / * P is valid, so update sequence number variables - * accordingly. After this update, P will pass the tests - * in Step 6. A SyncAck is generated if necessary in - * Step 15 * / - * Update S.GSR, S.SWL, S.SWH - * Otherwise, - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_SYNC || - dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && - dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) - dccp_update_gsr(sk, seqno); - else - return -1; - } - - /* - * Step 6: Check sequence numbers - * Let LSWL = S.SWL and LAWL = S.AWL - * If P.type == CloseReq or P.type == Close or P.type == Reset, - * LSWL := S.GSR + 1, LAWL := S.GAR - * If LSWL <= P.seqno <= S.SWH - * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), - * Update S.GSR, S.SWL, S.SWH - * If P.type != Sync, - * Update S.GAR - */ - lswl = dp->dccps_swl; - lawl = dp->dccps_awl; - - if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { - lswl = ADD48(dp->dccps_gsr, 1); - lawl = dp->dccps_gar; - } - - if (between48(seqno, lswl, dp->dccps_swh) && - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(ackno, lawl, dp->dccps_awh))) { - dccp_update_gsr(sk, seqno); - - if (dh->dccph_type != DCCP_PKT_SYNC && - ackno != DCCP_PKT_WITHOUT_ACK_SEQ && - after48(ackno, dp->dccps_gar)) - dp->dccps_gar = ackno; - } else { - unsigned long now = jiffies; - /* - * Step 6: Check sequence numbers - * Otherwise, - * If P.type == Reset, - * Send Sync packet acknowledging S.GSR - * Otherwise, - * Send Sync packet acknowledging P.seqno - * Drop packet and return - * - * These Syncs are rate-limited as per RFC 4340, 7.5.4: - * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. - */ - if (time_before(now, (dp->dccps_rate_last + - sysctl_dccp_sync_ratelimit))) - return -1; - - DCCP_WARN("Step 6 failed for %s packet, " - "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " - "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " - "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - (unsigned long long) lswl, (unsigned long long) seqno, - (unsigned long long) dp->dccps_swh, - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" - : "exists", - (unsigned long long) lawl, (unsigned long long) ackno, - (unsigned long long) dp->dccps_awh); - - dp->dccps_rate_last = now; - - if (dh->dccph_type == DCCP_PKT_RESET) - seqno = dp->dccps_gsr; - dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); - return -1; - } - - return 0; -} - -static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - - switch (dccp_hdr(skb)->dccph_type) { - case DCCP_PKT_DATAACK: - case DCCP_PKT_DATA: - /* - * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when - * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" - * - sk_receive_queue is full, use Code 2, "Receive Buffer" - */ - dccp_enqueue_skb(sk, skb); - return 0; - case DCCP_PKT_ACK: - goto discard; - case DCCP_PKT_RESET: - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - dccp_rcv_reset(sk, skb); - return 0; - case DCCP_PKT_CLOSEREQ: - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_CLOSE: - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_REQUEST: - /* Step 7 - * or (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state >= OPEN and P.type == Request - * and P.seqno >= S.OSR) - * or (S.state >= OPEN and P.type == Response - * and P.seqno >= S.OSR) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dp->dccps_role != DCCP_ROLE_LISTEN) - goto send_sync; - goto check_seq; - case DCCP_PKT_RESPONSE: - if (dp->dccps_role != DCCP_ROLE_CLIENT) - goto send_sync; -check_seq: - if (dccp_delta_seqno(dp->dccps_osr, - DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { -send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNC); - } - break; - case DCCP_PKT_SYNC: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNCACK); - /* - * From RFC 4340, sec. 5.7 - * - * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets - * MAY have non-zero-length application data areas, whose - * contents receivers MUST ignore. - */ - goto discard; - } - - DCCP_INC_STATS(DCCP_MIB_INERRS); -discard: - __kfree_skb(skb); - return 0; -} - -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - - return __dccp_rcv_established(sk, skb, dh, len); -discard: - __kfree_skb(skb); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_established); - -static int dccp_rcv_request_sent_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - /* - * Step 4: Prepare sequence numbers in REQUEST - * If S.state == REQUEST, - * If (P.type == Response or P.type == Reset) - * and S.AWL <= P.ackno <= S.AWH, - * / * Set sequence number variables corresponding to the - * other endpoint, so P will pass the tests in Step 6 * / - * Set S.GSR, S.ISR, S.SWL, S.SWH - * / * Response processing continues in Step 10; Reset - * processing continues in Step 9 * / - */ - if (dh->dccph_type == DCCP_PKT_RESPONSE) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - long tstamp = dccp_timestamp(); - - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, " - "P.ackno=%llu, S.AWH=%llu\n", - (unsigned long long)dp->dccps_awl, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long)dp->dccps_awh); - goto out_invalid_packet; - } - - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - - dp->dccps_options_received.dccpor_timestamp_echo)); - - /* Stop the REQUEST timer */ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - WARN_ON(sk->sk_send_head == NULL); - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - - /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. - */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - - dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); - - /* - * Step 10: Process REQUEST state (second part) - * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the - * server (see Step 4), and we should move to - * PARTOPEN state. PARTOPEN means send an Ack, - * don't send Data packets, retransmit Acks - * periodically, and always include any Init Cookie - * from the Response * / - * S.state := PARTOPEN - * Set PARTOPEN timer - * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the - * three-way handshake * / - */ - dccp_set_state(sk, DCCP_PARTOPEN); - - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. - */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } - - if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || - icsk->icsk_accept_queue.rskq_defer_accept) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * - * It may be deleted, but with this feature tcpdumps - * look so _wonderfully_ clever, that I was not able - * to stand against the temptation 8) --ANK - */ - /* - * OK, in DCCP we can as well do a similar trick, its - * even in the draft, but there is no need for us to - * schedule an ack here, as dccp_sendmsg does this for - * us, also stated in the draft. -acme - */ - __kfree_skb(skb); - return 0; - } - dccp_send_ack(sk); - return -1; - } - -out_invalid_packet: - /* dccp_v4_do_rcv will send a reset */ - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; -} - -static int dccp_rcv_respond_partopen_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; - int queued = 0; - - switch (dh->dccph_type) { - case DCCP_PKT_RESET: - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - break; - case DCCP_PKT_DATA: - if (sk->sk_state == DCCP_RESPOND) - break; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_ACK: - /* - * FIXME: we should be resetting the PARTOPEN (DELACK) timer - * here but only if we haven't used the DELACK timer for - * something else, like sending a delayed ack for a TIMESTAMP - * echo, etc, for now were not clearing it, sending an extra - * ACK when there is nothing else to do in DELACK is not a big - * deal after all. - */ - - /* Stop the PARTOPEN timer */ - if (sk->sk_state == DCCP_PARTOPEN) - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(sample)) { - long delta = dccp_timestamp() - sample; - - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); - } - - dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_set_state(sk, DCCP_OPEN); - - if (dh->dccph_type == DCCP_PKT_DATAACK || - dh->dccph_type == DCCP_PKT_DATA) { - __dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued - (by __dccp_rcv_established) */ - } - break; - } - - return queued; -} - -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int old_state = sk->sk_state; - bool acceptable; - int queued = 0; - - /* - * Step 3: Process LISTEN state - * - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init - * Cookies Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_LISTEN) { - if (dh->dccph_type == DCCP_PKT_REQUEST) { - /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH and RCU right there. - */ - rcu_read_lock(); - local_bh_disable(); - acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; - local_bh_enable(); - rcu_read_unlock(); - if (!acceptable) - return 1; - consume_skb(skb); - return 0; - } - if (dh->dccph_type == DCCP_PKT_RESET) - goto discard; - - /* Caller (dccp_v4_do_rcv) will send Reset */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } - - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } - - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_RESET) { - dccp_rcv_reset(sk, skb); - return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); - if (queued >= 0) - return queued; - - __kfree_skb(skb); - return 0; - - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - fallthrough; - case DCCP_RESPOND: - queued = dccp_rcv_respond_partopen_state_process(sk, skb, - dh, len); - break; - } - - if (dh->dccph_type == DCCP_PKT_ACK || - dh->dccph_type == DCCP_PKT_DATAACK) { - switch (old_state) { - case DCCP_PARTOPEN: - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - break; - } - } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); - goto discard; - } - - if (!queued) { -discard: - __kfree_skb(skb); - } - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_state_process); - -/** - * dccp_sample_rtt - Validate and finalise computation of RTT sample - * @sk: socket structure - * @delta: number of microseconds between packet and acknowledgment - * - * The routine is kept generic to work in different contexts. It should be - * called immediately when the ACK used for the RTT sample arrives. - */ -u32 dccp_sample_rtt(struct sock *sk, long delta) -{ - /* dccpor_elapsed_time is either zeroed out or set and > 0 */ - delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; -} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c deleted file mode 100644 index 2045ddac0fe9..000000000000 --- a/net/dccp/ipv4.c +++ /dev/null @@ -1,1101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/ipv4.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct dccp_v4_pernet { - struct sock *v4_ctl_sk; -}; - -static unsigned int dccp_v4_pernet_id __read_mostly; - -/* - * The per-net v4_ctl_sk socket is used for responding to - * the Out-of-the-blue (OOTB) packets. A control sock will be created - * for this socket at the initialization time. - */ - -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - __be16 orig_sport, orig_dport; - __be32 daddr, nexthop; - struct flowi4 *fl4; - struct rtable *rt; - int err; - struct ip_options_rcu *inet_opt; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (usin->sin_family != AF_INET) - return -EAFNOSUPPORT; - - nexthop = daddr = usin->sin_addr.s_addr; - - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); - if (inet_opt != NULL && inet_opt->opt.srr) { - if (daddr == 0) - return -EINVAL; - nexthop = inet_opt->opt.faddr; - } - - orig_sport = inet->inet_sport; - orig_dport = usin->sin_port; - fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, - sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport, - orig_dport, sk); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { - ip_rt_put(rt); - return -ENETUNREACH; - } - - if (inet_opt == NULL || !inet_opt->opt.srr) - daddr = fl4->daddr; - - if (inet->inet_saddr == 0) { - err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); - if (err) { - ip_rt_put(rt); - return err; - } - } else { - sk_rcv_saddr_set(sk, inet->inet_saddr); - } - - inet->inet_dport = usin->sin_port; - sk_daddr_set(sk, daddr); - - inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - /* - * Socket identity is still unknown (sport may be zero). - * However we set state to DCCP_REQUESTING and not releasing socket - * lock select source port, enter ourselves into the hash tables and - * complete initialization after this. - */ - dccp_set_state(sk, DCCP_REQUESTING); - err = inet_hash_connect(&dccp_death_row, sk); - if (err != 0) - goto failure; - - rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, - inet->inet_sport, inet->inet_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto failure; - } - /* OK, now commit destination to socket. */ - sk_setup_caps(sk, &rt->dst); - - dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - inet->inet_dport); - atomic_set(&inet->inet_id, get_random_u16()); - - err = dccp_connect(sk); - rt = NULL; - if (err != 0) - goto failure; -out: - return err; -failure: - /* - * This unhashes the socket and releases the local port, if necessary. - */ - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - goto out; -} -EXPORT_SYMBOL_GPL(dccp_v4_connect); - -/* - * This routine does path mtu discovery as defined in RFC1191. - */ -static inline void dccp_do_pmtu_discovery(struct sock *sk, - const struct iphdr *iph, - u32 mtu) -{ - struct dst_entry *dst; - const struct inet_sock *inet = inet_sk(sk); - const struct dccp_sock *dp = dccp_sk(sk); - - /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs - * send out by Linux are always < 576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == DCCP_LISTEN) - return; - - dst = inet_csk_update_pmtu(sk, mtu); - if (!dst) - return; - - /* Something is about to be wrong... Remember soft error - * for the case, if this connection will not able to recover. - */ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) - WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - - mtu = dst_mtu(dst); - - if (inet->pmtudisc != IP_PMTUDISC_DONT && - ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - dccp_sync_mss(sk, mtu); - - /* - * From RFC 4340, sec. 14.1: - * - * DCCP-Sync packets are the best choice for upward - * probing, since DCCP-Sync probes do not risk application - * data loss. - */ - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } /* else let the usual retransmit timer handle it */ -} - -static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_check(sk, 0); - - if (dst) - dst->ops->redirect(dst, sk, skb); -} - -void dccp_req_err(struct sock *sk, u64 seq) - { - struct request_sock *req = inet_reqsk(sk); - struct net *net = sock_net(sk); - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - } - reqsk_put(req); -} -EXPORT_SYMBOL(dccp_req_err); - -/* - * This routine is called by the ICMP module when it gets some sort of error - * condition. If err < 0 then the socket should be closed and the error - * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. - * After adjustment header points to the first 8 bytes of the tcp header. We - * need to find the appropriate port. - * - * The locking strategy used here is very "optimistic". When someone else - * accesses the socket the ICMP is just dropped and for some paths there is no - * check at all. A more general error queue to queue errors for later handling - * is probably better. - */ -static int dccp_v4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (struct iphdr *)skb->data; - const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct sock *sk; - __u64 seq; - int err; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - iph = (struct iphdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet_lookup_established(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, ntohs(dh->dccph_sport), - inet_iif(skb), 0); - if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. - */ - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - switch (type) { - case ICMP_REDIRECT: - if (!sock_owned_by_user(sk)) - dccp_do_redirect(skb, sk); - goto out; - case ICMP_SOURCE_QUENCH: - /* Just silently ignore these. */ - goto out; - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out; - - if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ - if (!sock_owned_by_user(sk)) - dccp_do_pmtu_discovery(sk, iph, info); - goto out; - } - - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - goto out; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - - sk_error_report(sk); - - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - /* If we've already connected we will keep trying - * until we time out, or the user gives up. - * - * rfc1122 4.2.3.9 allows to consider as hard errors - * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, - * but it is obsoleted by pmtu discovery). - * - * Note, that in modern internet, where routing is unreliable - * and in each dark corner broken firewalls sit, sending random - * errors ordered by their masters even this two messages finally lose - * their original sense (even Linux sends invalid PORT_UNREACHs) - * - * Now we are in compliance with RFCs. - * --ANK (980905) - */ - - if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { /* Only an error on timeout */ - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - -static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, - __be32 src, __be32 dst) -{ - return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) -{ - const struct inet_sock *inet = inet_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v4_csum_finish(skb, - inet->inet_saddr, - inet->inet_daddr); -} -EXPORT_SYMBOL_GPL(dccp_v4_send_check); - -static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) -{ - return secure_dccp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport); -} - -/* - * The three way handshake has completed - we got a valid ACK or DATAACK - - * now create the new socket. - * - * This is the equivalent of TCP's tcp_v4_syn_recv_sock - */ -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq; - struct inet_sock *newinet; - struct sock *newsk; - - if (sk_acceptq_is_full(sk)) - goto exit_overflow; - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto exit_nonewsk; - - newinet = inet_sk(newsk); - ireq = inet_rsk(req); - RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); - newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = ip_hdr(skb)->ttl; - atomic_set(&newinet->inet_id, get_random_u16()); - - if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - - sk_setup_caps(newsk, dst); - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) - goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - if (*own_req) - ireq->ireq_opt = NULL; - else - newinet->inet_opt = NULL; - return newsk; - -exit_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -exit_nonewsk: - dst_release(dst); -exit: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -put_and_exit: - newinet->inet_opt = NULL; - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto exit; -} -EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); - -static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, - struct sk_buff *skb) -{ - struct rtable *rt; - const struct iphdr *iph = ip_hdr(skb); - struct flowi4 fl4 = { - .flowi4_oif = inet_iif(skb), - .daddr = iph->saddr, - .saddr = iph->daddr, - .flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))), - .flowi4_scope = ip_sock_rt_scope(sk), - .flowi4_proto = sk->sk_protocol, - .fl4_sport = dccp_hdr(skb)->dccph_dport, - .fl4_dport = dccp_hdr(skb)->dccph_sport, - }; - - security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); - if (IS_ERR(rt)) { - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - - return &rt->dst; -} - -static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) -{ - int err = -1; - struct sk_buff *skb; - struct dst_entry *dst; - struct flowi4 fl4; - - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto out; - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct dccp_hdr *dh = dccp_hdr(skb); - - dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, - ireq->ir_rmt_addr); - rcu_read_lock(); - err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt), - READ_ONCE(inet_sk(sk)->tos)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -out: - dst_release(dst); - return err; -} - -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - int err; - const struct iphdr *rxiph; - struct sk_buff *skb; - struct dst_entry *dst; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v4_pernet *pn; - struct sock *ctl_sk; - - /* Never send a reset in response to a reset. */ - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) - return; - - pn = net_generic(net, dccp_v4_pernet_id); - ctl_sk = pn->v4_ctl_sk; - dst = dccp_v4_route_skb(net, ctl_sk, rxskb); - if (dst == NULL) - return; - - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - goto out; - - rxiph = ip_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, - rxiph->daddr); - skb_dst_set(skb, dst_clone(dst)); - - local_bh_disable(); - bh_lock_sock(ctl_sk); - err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL, - inet_sk(ctl_sk)->tos); - bh_unlock_sock(ctl_sk); - - if (net_xmit_eval(err) == 0) { - __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - } - local_bh_enable(); -out: - dst_release(dst); -} - -static void dccp_v4_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); -} - -void dccp_syn_ack_timeout(const struct request_sock *req) -{ -} -EXPORT_SYMBOL(dccp_syn_ack_timeout); - -static struct request_sock_ops dccp_request_sock_ops __read_mostly = { - .family = PF_INET, - .obj_size = sizeof(struct dccp_request_sock), - .rtx_syn_ack = dccp_v4_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v4_reqsk_destructor, - .send_reset = dccp_v4_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct inet_request_sock *ireq; - struct request_sock *req; - struct dccp_request_sock *dreq; - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return 0; /* discard, don't send a reset here */ - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); - sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ireq_family = AF_INET; - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). - */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v4_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v4_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} -EXPORT_SYMBOL_GPL(dccp_v4_conn_request); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_hdr *dh = dccp_hdr(skb); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dh, skb->len)) - goto reset; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dh, skb->len)) - goto reset; - return 0; - -reset: - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - kfree_skb(skb); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); - -/** - * dccp_invalid_packet - check for malformed packets - * @skb: Packet to validate - * - * Implements RFC 4340, 8.5: Step 1: Check header basics - * Packets that fail these checks are ignored and do not receive Resets. - */ -int dccp_invalid_packet(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - unsigned int cscov; - u8 dccph_doff; - - if (skb->pkt_type != PACKET_HOST) - return 1; - - /* If the packet is shorter than 12 bytes, drop packet and return */ - if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - DCCP_WARN("pskb_may_pull failed\n"); - return 1; - } - - dh = dccp_hdr(skb); - - /* If P.type is not understood, drop packet and return */ - if (dh->dccph_type >= DCCP_PKT_INVALID) { - DCCP_WARN("invalid packet type\n"); - return 1; - } - - /* - * If P.Data Offset is too small for packet type, drop packet and return - */ - dccph_doff = dh->dccph_doff; - if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); - return 1; - } - /* - * If P.Data Offset is too large for packet, drop packet and return - */ - if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); - return 1; - } - dh = dccp_hdr(skb); - /* - * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet - * has short sequence numbers), drop packet and return - */ - if ((dh->dccph_type < DCCP_PKT_DATA || - dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { - DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", - dccp_packet_name(dh->dccph_type)); - return 1; - } - - /* - * If P.CsCov is too large for the packet size, drop packet and return. - * This must come _before_ checksumming (not as RFC 4340 suggests). - */ - cscov = dccp_csum_coverage(skb); - if (cscov > skb->len) { - DCCP_WARN("P.CsCov %u exceeds packet length %d\n", - dh->dccph_cscov, skb->len); - return 1; - } - - /* If header checksum is incorrect, drop packet and return. - * (This step is completed in the AF-dependent functions.) */ - skb->csum = skb_checksum(skb, 0, cscov, 0); - - return 0; -} -EXPORT_SYMBOL_GPL(dccp_invalid_packet); - -/* this is called when real data arrives */ -static int dccp_v4_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - const struct iphdr *iph; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - iph = ip_hdr(skb); - /* Step 1: If header checksum is incorrect, drop packet and return */ - if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", - dccp_packet_name(dh->dccph_type), - &iph->saddr, ntohs(dh->dccph_sport), - &iph->daddr, ntohs(dh->dccph_dport), - (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); - - if (dccp_packet_without_ack(skb)) { - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - dccp_pr_debug_cat("\n"); - } else { - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - } - -lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: "Such packets SHOULD be reported using Data Dropped - * options (Section 11.7) with Drop Code 0, Protocol - * Constraints." */ - goto discard_and_relse; - } - - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); - -no_dccp_socket: - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v4_conn_request, - .syn_recv_sock = dccp_v4_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ip_setsockopt, - .getsockopt = ip_getsockopt, -}; - -static int dccp_v4_init_sock(struct sock *sk) -{ - static __u8 dccp_v4_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v4_ctl_sock_initialized)) - dccp_v4_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; - } - - return err; -} - -static struct timewait_sock_ops dccp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct inet_timewait_sock), -}; - -static struct proto dccp_v4_prot = { - .name = "DCCP", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v4_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v4_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v4_do_rcv, - .hash = inet_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp_request_sock_ops, - .twsk_prot = &dccp_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct net_protocol dccp_v4_protocol = { - .handler = dccp_v4_rcv, - .err_handler = dccp_v4_err, - .no_policy = 1, - .icmp_strict_tag_validation = 1, -}; - -static const struct proto_ops inet_dccp_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll = dccp_poll, - .ioctl = inet_ioctl, - .gettstamp = sock_gettstamp, - /* FIXME: work on inet_listen to rename it to sock_common_listen */ - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -}; - -static struct inet_protosw dccp_v4_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, - .ops = &inet_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v4_init_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v4_exit_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - inet_ctl_sock_destroy(pn->v4_ctl_sk); -} - -static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo); -} - -static struct pernet_operations dccp_v4_ops = { - .init = dccp_v4_init_net, - .exit = dccp_v4_exit_net, - .exit_batch = dccp_v4_exit_batch, - .id = &dccp_v4_pernet_id, - .size = sizeof(struct dccp_v4_pernet), -}; - -static int __init dccp_v4_init(void) -{ - int err = proto_register(&dccp_v4_prot, 1); - - if (err) - goto out; - - inet_register_protosw(&dccp_v4_protosw); - - err = register_pernet_subsys(&dccp_v4_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - if (err) - goto out_proto_unregister; - -out: - return err; -out_proto_unregister: - unregister_pernet_subsys(&dccp_v4_ops); -out_destroy_ctl_sock: - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); - goto out; -} - -static void __exit dccp_v4_exit(void) -{ - inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v4_ops); - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); -} - -module_init(dccp_v4_init); -module_exit(dccp_v4_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c deleted file mode 100644 index e24dbffabfc1..000000000000 --- a/net/dccp/ipv6.c +++ /dev/null @@ -1,1174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DCCP over IPv6 - * Linux INET6 implementation - * - * Based on net/dccp6/ipv6.c - * - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "dccp.h" -#include "ipv6.h" -#include "feat.h" - -struct dccp_v6_pernet { - struct sock *v6_ctl_sk; -}; - -static unsigned int dccp_v6_pernet_id __read_mostly; - -/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ - -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; - -/* add pseudo-header to DCCP checksum stored in skb->csum */ -static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); -} - -static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr); -} - -static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) -{ - return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport ); - -} - -static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *hdr; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - struct ipv6_pinfo *np; - struct sock *sk; - int err; - __u64 seq; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - hdr = (const struct ipv6hdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet6_lookup_established(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb), 0); - - if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - np = inet6_sk(sk); - - if (type == NDISC_REDIRECT) { - if (!sock_owned_by_user(sk)) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst) - dst->ops->redirect(dst, sk, skb); - } - goto out; - } - - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; - - if (!ip6_sk_accept_pmtu(sk)) - goto out; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) - dccp_sync_mss(sk, dst_mtu(dst)); - goto out; - } - - icmpv6_err_convert(type, code, &err); - - /* Might be for an request_sock */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - /* - * Wake people up to see the error - * (see connect in sock.c) - */ - sk_error_report(sk); - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - - -static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - struct in6_addr *final_p, final; - struct flowi6 fl6; - int err = -1; - struct dst_entry *dst; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowlabel = 0; - fl6.flowi6_oif = ireq->ir_iif; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto done; - } - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - struct dccp_hdr *dh = dccp_hdr(skb); - struct ipv6_txoptions *opt; - - dh->dccph_checksum = dccp_v6_csum_finish(skb, - &ireq->ir_v6_loc_addr, - &ireq->ir_v6_rmt_addr); - fl6.daddr = ireq->ir_v6_rmt_addr; - rcu_read_lock(); - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, - np->tclass, READ_ONCE(sk->sk_priority)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -done: - dst_release(dst); - return err; -} - -static void dccp_v6_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->ipv6_opt); - kfree_skb(inet_rsk(req)->pktopts); -} - -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - const struct ipv6hdr *rxip6h; - struct sk_buff *skb; - struct flowi6 fl6; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v6_pernet *pn; - struct sock *ctl_sk; - struct dst_entry *dst; - - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (!ipv6_unicast_destination(rxskb)) - return; - - pn = net_generic(net, dccp_v6_pernet_id); - ctl_sk = pn->v6_ctl_sk; - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - return; - - rxip6h = ipv6_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, - &rxip6h->daddr); - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = rxip6h->saddr; - fl6.saddr = rxip6h->daddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.flowi6_oif = inet6_iif(rxskb); - fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; - fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); - - /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); - if (!IS_ERR(dst)) { - skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - return; - } - - kfree_skb(skb); -} - -static struct request_sock_ops dccp6_request_sock_ops = { - .family = AF_INET6, - .obj_size = sizeof(struct dccp6_request_sock), - .rtx_syn_ack = dccp_v6_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v6_reqsk_destructor, - .send_reset = dccp_v6_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct request_sock *req; - struct dccp_request_sock *dreq; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_conn_request(sk, skb); - - if (!ipv6_unicast_destination(skb)) - return 0; /* discard, don't send a reset here */ - - if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { - __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); - return 0; - } - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * There are no SYN attacks on IPv6, yet... - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - ireq->ir_rmt_addr = LOOPBACK4_IPV6; - ireq->ir_loc_addr = LOOPBACK4_IPV6; - - ireq->ireq_family = AF_INET6; - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { - refcount_inc(&skb->users); - ireq->pktopts = skb; - } - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - /* So that link locals have meaning */ - if (!ireq->ir_iif && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). - */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v6_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v6_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} - -static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp; - const struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_txoptions *opt; - struct inet_sock *newinet; - struct dccp6_sock *newdp6; - struct sock *newsk; - - if (skb->protocol == htons(ETH_P_IP)) { - /* - * v6 mapped - */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, - req_unhash, own_req); - if (newsk == NULL) - return NULL; - - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = newsk->sk_v6_rcv_saddr; - - inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; - newsk->sk_backlog_rcv = dccp_v4_do_rcv; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->mcast_oif = inet_iif(skb); - newnp->mcast_hops = ip_hdr(skb)->ttl; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks count - * here, dccp_create_openreq_child now does this for us, see the comment in - * that function for the gory details. -acme - */ - - /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 icsk.icsk_af_ops. - Sync it now. - */ - dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); - - return newsk; - } - - - if (sk_acceptq_is_full(sk)) - goto out_overflow; - - if (!dst) { - struct flowi6 fl6; - - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); - if (!dst) - goto out; - } - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto out_nonewsk; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks - * count here, dccp_create_openreq_child now does this for us, see the - * comment in that function for the gory details. -acme - */ - - ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | - NETIF_F_TSO); - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = ireq->ir_v6_loc_addr; - - /* Now IPv6 options... - - First: no IPv4 options. - */ - newinet->inet_opt = NULL; - - /* Clone RX bits */ - newnp->rxopt.all = np->rxopt.all; - - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - - /* - * Clone native IPv6 options from listening socket (if any) - * - * Yes, keeping reference count would be much more clever, but we make - * one more one thing there: reattach optmem to newsk. - */ - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - if (opt) { - opt = ipv6_dup_options(newsk, opt); - RCU_INIT_POINTER(newnp->opt, opt); - } - inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (opt) - inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + - opt->opt_flen; - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto out; - } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - } - - return newsk; - -out_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: - dst_release(dst); -out: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -} - -/* The socket must have it's spinlock held when we get - * here. - * - * We have a potential double-lock case here, so even when - * doing backlog processing we use the BH locking scheme. - * This is because we cannot sleep with the original spinlock - * held. - */ -static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *opt_skb = NULL; - - /* Imagine: socket is IPv6. IPv4 packet arrives, - goes to IPv4 receive handler and backlogged. - From backlog it always goes here. Kerboom... - Fortunately, dccp_rcv_established and rcv_established - handle them correctly, but it is not case with - dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK - */ - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_do_rcv(sk, skb); - - if (sk_filter(sk, skb)) - goto discard; - - /* - * socket locking is here for SMP purposes as backlog rcv is currently - * called with bh processing disabled. - */ - - /* Do Stevens' IPV6_PKTOPTIONS. - - Yes, guys, it is the only place in our code, where we - may make it not affecting IPv4. - The rest of code is protocol independent, - and I do not like idea to uglify IPv4. - - Actually, all the idea behind IPV6_PKTOPTIONS - looks not very well thought. For now we latch - options, received in the last packet, enqueued - by tcp. Feel free to propose better solution. - --ANK (980728) - */ - if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) - opt_skb = skb_clone_and_charge_r(skb, sk); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - -reset: - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); -discard: - if (opt_skb != NULL) - __kfree_skb(opt_skb); - kfree_skb(skb); - return 0; - -/* Handling IPV6_PKTOPTIONS skb the similar - * way it's done for net/ipv6/tcp_ipv6.c - */ -ipv6_pktoptions: - if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { - if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb)); - if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); - if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) - np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (inet6_test_bit(REPFLOW, sk)) - np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb, - &DCCP_SKB_CB(opt_skb)->header.h6)) { - memmove(IP6CB(opt_skb), - &DCCP_SKB_CB(opt_skb)->header.h6, - sizeof(struct inet6_skb_parm)); - opt_skb = xchg(&np->pktoptions, opt_skb); - } else { - __kfree_skb(opt_skb); - opt_skb = xchg(&np->pktoptions, NULL); - } - } - - kfree_skb(opt_skb); - return 0; -} - -static int dccp_v6_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - /* Step 1: If header checksum is incorrect, drop packet and return. */ - if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - if (dccp_packet_without_ack(skb)) - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - else - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - -lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ - goto discard_and_relse; - } - - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, - refcounted) ? -1 : 0; - -no_dccp_socket: - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct in6_addr *saddr = NULL, *final_p, final; - struct ipv6_txoptions *opt; - struct flowi6 fl6; - struct dst_entry *dst; - int addr_type; - int err; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - if (usin->sin6_family != AF_INET6) - return -EAFNOSUPPORT; - - memset(&fl6, 0, sizeof(fl6)); - - if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { - struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (IS_ERR(flowlabel)) - return -EINVAL; - fl6_sock_release(flowlabel); - } - } - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 1; - - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type & IPV6_ADDR_MULTICAST) - return -ENETUNREACH; - - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - usin->sin6_scope_id) { - /* If interface is set while binding, indices - * must coincide. - */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) - return -EINVAL; - - sk->sk_bound_dev_if = usin->sin6_scope_id; - } - - /* Connect to link-local address requires an interface */ - if (!sk->sk_bound_dev_if) - return -EINVAL; - } - - sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; - - /* - * DCCP over IPv4 - */ - if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = icsk->icsk_ext_hdr_len; - struct sockaddr_in sin; - - net_dbg_ratelimited("connect: ipv4 mapped\n"); - - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - - sin.sin_family = AF_INET; - sin.sin_port = usin->sin6_port; - sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - - icsk->icsk_af_ops = &dccp_ipv6_mapped; - sk->sk_backlog_rcv = dccp_v4_do_rcv; - - err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); - if (err) { - icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_backlog_rcv = dccp_v6_do_rcv; - goto failure; - } - np->saddr = sk->sk_v6_rcv_saddr; - return err; - } - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - saddr = &sk->sk_v6_rcv_saddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? *saddr : np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - goto failure; - } - - if (saddr == NULL) { - saddr = &fl6.saddr; - - err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) - goto failure; - } - - /* set the source address */ - np->saddr = *saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - - ip6_dst_store(sk, dst, NULL, NULL); - - icsk->icsk_ext_hdr_len = 0; - if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; - - inet->inet_dport = usin->sin6_port; - - dccp_set_state(sk, DCCP_REQUESTING); - err = inet6_hash_connect(&dccp_death_row, sk); - if (err) - goto late_failure; - - dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); - err = dccp_connect(sk); - if (err) - goto late_failure; - - return 0; - -late_failure: - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - __sk_dst_reset(sk); -failure: - inet->inet_dport = 0; - sk->sk_route_caps = 0; - return err; -} - -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { - .queue_xmit = inet6_csk_xmit, - .send_check = dccp_v6_send_check, - .rebuild_header = inet6_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct ipv6hdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -/* - * DCCP over IPv4 via INET6 API - */ -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -static void dccp_v6_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet6_sock_destruct(sk); -} - -/* NOTE: A lot of things set to zero explicitly by call to - * sk_alloc() so need not be done here. - */ -static int dccp_v6_init_sock(struct sock *sk) -{ - static __u8 dccp_v6_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v6_ctl_sock_initialized)) - dccp_v6_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_destruct = dccp_v6_sk_destruct; - } - - return err; -} - -static struct timewait_sock_ops dccp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), -}; - -static struct proto dccp_v6_prot = { - .name = "DCCPv6", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v6_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v6_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v6_do_rcv, - .hash = inet6_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp6_sock), - .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp6_request_sock_ops, - .twsk_prot = &dccp6_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct inet6_protocol dccp_v6_protocol = { - .handler = dccp_v6_rcv, - .err_handler = dccp_v6_err, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, -}; - -static const struct proto_ops inet6_dccp_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet6_getname, - .poll = dccp_poll, - .ioctl = inet6_ioctl, - .gettstamp = sock_gettstamp, - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = inet6_compat_ioctl, -#endif -}; - -static struct inet_protosw dccp_v6_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v6_prot, - .ops = &inet6_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v6_init_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v6_exit_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - inet_ctl_sock_destroy(pn->v6_ctl_sk); -} - -static struct pernet_operations dccp_v6_ops = { - .init = dccp_v6_init_net, - .exit = dccp_v6_exit_net, - .id = &dccp_v6_pernet_id, - .size = sizeof(struct dccp_v6_pernet), -}; - -static int __init dccp_v6_init(void) -{ - int err = proto_register(&dccp_v6_prot, 1); - - if (err) - goto out; - - inet6_register_protosw(&dccp_v6_protosw); - - err = register_pernet_subsys(&dccp_v6_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - if (err) - goto out_unregister_proto; - -out: - return err; -out_unregister_proto: - unregister_pernet_subsys(&dccp_v6_ops); -out_destroy_ctl_sock: - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); - goto out; -} - -static void __exit dccp_v6_exit(void) -{ - inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v6_ops); - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); -} - -module_init(dccp_v6_init); -module_exit(dccp_v6_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h deleted file mode 100644 index c5d14c48def1..000000000000 --- a/net/dccp/ipv6.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_IPV6_H -#define _DCCP_IPV6_H -/* - * net/dccp/ipv6.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include - -struct dccp6_sock { - struct dccp_sock dccp; - struct ipv6_pinfo inet6; -}; - -struct dccp6_request_sock { - struct dccp_request_sock dccp; -}; - -struct dccp6_timewait_sock { - struct inet_timewait_sock inet; -}; - -#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c deleted file mode 100644 index fecc8190064f..000000000000 --- a/net/dccp/minisocks.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/minisocks.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct inet_timewait_death_row dccp_death_row = { - .tw_refcount = REFCOUNT_INIT(1), - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &dccp_hashinfo, -}; - -EXPORT_SYMBOL_GPL(dccp_death_row); - -void dccp_time_wait(struct sock *sk, int state, int timeo) -{ - struct inet_timewait_sock *tw; - - tw = inet_twsk_alloc(sk, &dccp_death_row, state); - - if (tw != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == PF_INET6) { - tw->tw_v6_daddr = sk->sk_v6_daddr; - tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = sk->sk_ipv6only; - } -#endif - - /* Get the TIME_WAIT timeout firing. */ - if (timeo < rto) - timeo = rto; - - if (state == DCCP_TIME_WAIT) - timeo = DCCP_TIMEWAIT_LEN; - - /* Linkage updates. - * Note that access to tw after this point is illegal. - */ - inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); - } else { - /* Sorry, if we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. - */ - DCCP_WARN("time wait bucket table overflow\n"); - } - - dccp_done(sk); -} - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb) -{ - /* - * Step 3: Process LISTEN state - * - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - */ - struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - - if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct dccp_sock *newdp = dccp_sk(newsk); - - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_hc_rx_ccid = NULL; - newdp->dccps_hc_tx_ccid = NULL; - newdp->dccps_service = dreq->dreq_service; - newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; - newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - - INIT_LIST_HEAD(&newdp->dccps_featneg); - /* - * Step 3: Process LISTEN state - * - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). - */ - newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gss = dreq->dreq_gss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_isr = dreq->dreq_isr; - newdp->dccps_gsr = dreq->dreq_gsr; - - /* - * Activate features: initialise CCIDs, sequence windows etc. - */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - sk_free_unlock_clone(newsk); - return NULL; - } - dccp_init_xmit_timers(newsk); - - __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); - } - return newsk; -} - -EXPORT_SYMBOL_GPL(dccp_create_openreq_child); - -/* - * Process an incoming packet for RESPOND sockets represented - * as an request_sock. - */ -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) -{ - struct sock *child = NULL; - struct dccp_request_sock *dreq = dccp_rsk(req); - bool own_req; - - /* TCP/DCCP listeners became lockless. - * DCCP stores complex state in its request_sock, so we need - * a protection for them, now this code runs without being protected - * by the parent (listener) lock. - */ - spin_lock_bh(&dreq->dreq_lock); - - /* Check for retransmitted REQUEST */ - if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) { - dccp_pr_debug("Retransmitted REQUEST\n"); - dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq; - /* - * Send another RESPONSE packet - * To protect against Request floods, increment retrans - * counter (backoff, monitored by dccp_response_timer). - */ - inet_rtx_syn_ack(sk, req); - } - /* Network Duplicate, discard packet */ - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && - dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) - goto drop; - - /* Invalid ACK */ - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dreq->dreq_iss, dreq->dreq_gss)) { - dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " - "dreq_iss=%llu, dreq_gss=%llu\n", - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) dreq->dreq_iss, - (unsigned long long) dreq->dreq_gss); - goto drop; - } - - if (dccp_parse_options(sk, dreq, skb)) - goto drop; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - req, &own_req); - if (child) { - child = inet_csk_complete_hashdance(sk, child, req, own_req); - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; -drop: - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - - inet_csk_reqsk_queue_drop(sk, req); -out: - spin_unlock_bh(&dreq->dreq_lock); - return child; -} - -EXPORT_SYMBOL_GPL(dccp_check_req); - -/* - * Queue segment on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket. - */ -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) - __releases(child) -{ - int ret = 0; - const int state = child->sk_state; - - if (!sock_owned_by_user(child)) { - ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), - skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent); - } else { - /* Alas, it is possible again, because we do lookup - * in main socket hash table and lock on listening - * socket does not protect us more. - */ - __sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - sock_put(child); - return ret; -} - -EXPORT_SYMBOL_GPL(dccp_child_process); - -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk) -{ - DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); - -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) -{ - struct dccp_request_sock *dreq = dccp_rsk(req); - - spin_lock_init(&dreq->dreq_lock); - inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); - inet_rsk(req)->acked = 0; - dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c deleted file mode 100644 index db62d4767024..000000000000 --- a/net/dccp/options.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/options.c - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Aristeu Sergio Rozanski Filho - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005 Ian McDonald - */ -#include -#include -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -u64 dccp_decode_value_var(const u8 *bf, const u8 len) -{ - u64 value = 0; - - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; - if (len > 3) - value += ((u64)*bf++) << 24; - if (len > 2) - value += ((u64)*bf++) << 16; - if (len > 1) - value += ((u64)*bf++) << 8; - if (len > 0) - value += *bf; - - return value; -} - -/** - * dccp_parse_options - Parse DCCP options present in @skb - * @sk: client|server|listening dccp socket (when @dreq != NULL) - * @dreq: request socket to use during connection setup, or NULL - * @skb: frame to parse - */ -int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_hdr *dh = dccp_hdr(skb); - const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - struct dccp_options_received *opt_recv = &dp->dccps_options_received; - unsigned char opt, len; - unsigned char *value; - u32 elapsed_time; - __be32 opt_val; - int rc; - int mandatory = 0; - - memset(opt_recv, 0, sizeof(*opt_recv)); - - opt = len = 0; - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_nonsensical_length; - - len = *opt_ptr++; - if (len < 2) - goto out_nonsensical_length; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_nonsensical_length; - } - - /* - * CCID-specific options are ignored during connection setup, as - * negotiation may still be in progress (see RFC 4340, 10.3). - * The same applies to Ack Vectors, as these depend on the CCID. - */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || - opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) - goto ignore_option; - - switch (opt) { - case DCCPO_PADDING: - break; - case DCCPO_MANDATORY: - if (mandatory) - goto out_invalid_option; - if (pkt_type != DCCP_PKT_DATA) - mandatory = 1; - break; - case DCCPO_NDP_COUNT: - if (len > 6) - goto out_invalid_option; - - opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), - (unsigned long long)opt_recv->dccpor_ndp); - break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ - break; - if (len == 0) - goto out_invalid_option; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; - break; - case DCCPO_TIMESTAMP: - if (len != 4) - goto out_invalid_option; - /* - * RFC 4340 13.1: "The precise time corresponding to - * Timestamp Value zero is not specified". We use - * zero to indicate absence of a meaningful timestamp. - */ - opt_val = get_unaligned((__be32 *)value); - if (unlikely(opt_val == 0)) { - DCCP_WARN("Timestamp with zero value\n"); - break; - } - - if (dreq != NULL) { - dreq->dreq_timestamp_echo = ntohl(opt_val); - dreq->dreq_timestamp_time = dccp_timestamp(); - } else { - opt_recv->dccpor_timestamp = - dp->dccps_timestamp_echo = ntohl(opt_val); - dp->dccps_timestamp_time = dccp_timestamp(); - } - dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), ntohl(opt_val), - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); - break; - case DCCPO_TIMESTAMP_ECHO: - if (len != 4 && len != 6 && len != 8) - goto out_invalid_option; - - opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp_echo = ntohl(opt_val); - - dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu", dccp_role(sk), - opt_recv->dccpor_timestamp_echo, - len + 2, - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - - value += 4; - - if (len == 4) { /* no elapsed time included */ - dccp_pr_debug_cat("\n"); - break; - } - - if (len == 6) { /* 2-byte elapsed time */ - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else { /* 4-byte elapsed time */ - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } - - dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); - - /* Give precedence to the biggest ELAPSED_TIME */ - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - break; - case DCCPO_ELAPSED_TIME: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ - break; - - if (len == 2) { - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else if (len == 4) { - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } else { - goto out_invalid_option; - } - - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - - dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", - dccp_role(sk), elapsed_time); - break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - */ - fallthrough; - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - default: - DCCP_CRIT("DCCP(%p): option %d(len=%d) not " - "implemented, ignoring", sk, opt, len); - break; - } -ignore_option: - if (opt != DCCPO_MANDATORY) - mandatory = 0; - } - - /* mandatory was the last byte in option list -> reset connection */ - if (mandatory) - goto out_invalid_option; - -out_nonsensical_length: - /* RFC 4340, 5.8: ignore option and all remaining option space */ - return 0; - -out_invalid_option: - DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; - DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; - DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; - DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; - return -1; -} - -EXPORT_SYMBOL_GPL(dccp_parse_options); - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) -{ - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; - if (len > 3) - *to++ = (value & 0xFF000000) >> 24; - if (len > 2) - *to++ = (value & 0xFF0000) >> 16; - if (len > 1) - *to++ = (value & 0xFF00) >> 8; - if (len > 0) - *to++ = (value & 0xFF); -} - -static inline u8 dccp_ndp_len(const u64 ndp) -{ - if (likely(ndp <= 0xFF)) - return 1; - return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); -} - -int dccp_insert_option(struct sk_buff *skb, const unsigned char option, - const void *value, const unsigned char len) -{ - unsigned char *to; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; - - to = skb_push(skb, len + 2); - *to++ = option; - *to++ = len + 2; - - memcpy(to, value, len); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option); - -static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - u64 ndp = dp->dccps_ndp_count; - - if (dccp_non_data_packet(skb)) - ++dp->dccps_ndp_count; - else - dp->dccps_ndp_count = 0; - - if (ndp > 0) { - unsigned char *ptr; - const int ndp_len = dccp_ndp_len(ndp); - const int len = ndp_len + 2; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - ptr = skb_push(skb, len); - *ptr++ = DCCPO_NDP_COUNT; - *ptr++ = len; - dccp_encode_value_var(ndp, ptr, ndp_len); - } - - return 0; -} - -static inline int dccp_elapsed_time_len(const u32 elapsed_time) -{ - return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4; -} - -static int dccp_insert_option_timestamp(struct sk_buff *skb) -{ - __be32 now = htonl(dccp_timestamp()); - /* yes this will overflow but that is the point as we want a - * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ - - return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); -} - -static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, - struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - __be32 tstamp_echo; - unsigned char *to; - u32 elapsed_time, elapsed_time_len, len; - - if (dreq != NULL) { - elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; - tstamp_echo = htonl(dreq->dreq_timestamp_echo); - dreq->dreq_timestamp_echo = 0; - } else { - elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; - tstamp_echo = htonl(dp->dccps_timestamp_echo); - dp->dccps_timestamp_echo = 0; - } - - elapsed_time_len = dccp_elapsed_time_len(elapsed_time); - len = 6 + elapsed_time_len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - *to++ = DCCPO_TIMESTAMP_ECHO; - *to++ = len; - - memcpy(to, &tstamp_echo, 4); - to += 4; - - if (elapsed_time_len == 2) { - const __be16 var16 = htons((u16)elapsed_time); - memcpy(to, &var16, 2); - } else if (elapsed_time_len == 4) { - const __be32 var32 = htonl(elapsed_time); - memcpy(to, &var32, 4); - } - - return 0; -} - -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; - - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); - return -1; - } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. - */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} - -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * @skb: frame into which to insert option - * - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; - return 0; -} - -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @skb: frame to insert feature negotiation option into - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) -{ - u8 tot_len, *to; - - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; - } - - if (unlikely(val == NULL || len == 0)) - len = repeat_first = false; - tot_len = 3 + repeat_first + len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; - } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); - return 0; -} - -/* The length of all options needs to be a multiple of 4 (5.8) */ -static void dccp_insert_option_padding(struct sk_buff *skb) -{ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; - - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } -} - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used for TFRC initialisation. - */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } - } - - if (dp->dccps_hc_rx_insert_options) { - if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) - return -1; - dp->dccps_hc_rx_insert_options = 0; - } - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(dp, NULL, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} - -int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) -{ - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - - /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - if (dreq->dreq_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(NULL, dreq, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} diff --git a/net/dccp/output.c b/net/dccp/output.c deleted file mode 100644 index 39cf3430177a..000000000000 --- a/net/dccp/output.c +++ /dev/null @@ -1,708 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/output.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -static inline void dccp_event_ack_sent(struct sock *sk) -{ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); -} - -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - skb_set_owner_w(skb, sk); - WARN_ON(sk->sk_send_head); - sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); -} - -/* - * All SKB's seen here are completely headerless. It is our - * job to build the DCCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - */ -static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if (likely(skb != NULL)) { - struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - struct dccp_hdr *dh; - /* XXX For now we're using only 48 bits sequence numbers */ - const u32 dccp_header_size = sizeof(*dh) + - sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); - int err, set_ack = 1; - u64 ackno = dp->dccps_gsr; - /* - * Increment GSS here already in case the option code needs it. - * Update GSS for real only if option processing below succeeds. - */ - dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); - - switch (dcb->dccpd_type) { - case DCCP_PKT_DATA: - set_ack = 0; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_RESET: - break; - - case DCCP_PKT_REQUEST: - set_ack = 0; - /* Use ISS on the first (non-retransmitted) Request. */ - if (icsk->icsk_retransmits == 0) - dcb->dccpd_seq = dp->dccps_iss; - fallthrough; - - case DCCP_PKT_SYNC: - case DCCP_PKT_SYNCACK: - ackno = dcb->dccpd_ack_seq; - fallthrough; - default: - /* - * Set owner/destructor: some skbs are allocated via - * alloc_skb (e.g. when retransmission may happen). - * Only Data, DataAck, and Reset packets should come - * through here with skb->sk set. - */ - WARN_ON(skb->sk); - skb_set_owner_w(skb, sk); - break; - } - - if (dccp_insert_options(sk, skb)) { - kfree_skb(skb); - return -EPROTO; - } - - - /* Build DCCP header and checksum it. */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_type = dcb->dccpd_type; - dh->dccph_sport = inet->inet_sport; - dh->dccph_dport = inet->inet_dport; - dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; - dh->dccph_ccval = dcb->dccpd_ccval; - dh->dccph_cscov = dp->dccps_pcslen; - /* XXX For now we're using only 48 bits sequence numbers */ - dh->dccph_x = 1; - - dccp_update_gss(sk, dcb->dccpd_seq); - dccp_hdr_set_seq(dh, dp->dccps_gss); - if (set_ack) - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); - - switch (dcb->dccpd_type) { - case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = - dp->dccps_service; - /* - * Limit Ack window to ISS <= P.ackno <= GSS, so that - * only Responses to Requests we sent are considered. - */ - dp->dccps_awl = dp->dccps_iss; - break; - case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = - dcb->dccpd_reset_code; - break; - } - - icsk->icsk_af_ops->send_check(sk, skb); - - if (set_ack) - dccp_event_ack_sent(sk); - - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - return net_xmit_eval(err); - } - return -ENOBUFS; -} - -/** - * dccp_determine_ccmps - Find out about CCID-specific packet-size limits - * @dp: socket to find packet size limits of - * - * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), - * since the RX CCID is restricted to feedback packets (Acks), which are small - * in comparison with the data traffic. A value of 0 means "no current CCMPS". - */ -static u32 dccp_determine_ccmps(const struct dccp_sock *dp) -{ - const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; - - if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) - return 0; - return tx_ccid->ccid_ops->ccid_ccmps; -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - - /* Account for header lengths and IPv4/v6 option overhead */ - cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + - sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); - - /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) - */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); - - /* And store cached results */ - icsk->icsk_pmtu_cookie = pmtu; - WRITE_ONCE(dp->dccps_mss_cache, cur_mps); - - return cur_mps; -} - -EXPORT_SYMBOL_GPL(dccp_sync_mss); - -void dccp_write_space(struct sock *sk) -{ - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible(&wq->wait); - /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); - - rcu_read_unlock(); -} - -/** - * dccp_wait_for_ccid - Await CCID send permission - * @sk: socket to wait for - * @delay: timeout in jiffies - * - * This is used by CCIDs which need to delay the send time in process context. - */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) -{ - DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); - - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk_sleep(sk), &wait); - - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * @sk: socket to send data packet on - * - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); - - if (unlikely(skb == NULL)) - return; - len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); -} - -/** - * dccp_flush_write_queue - Drain queue at end of connection - * @sk: socket to be drained - * @time_budget: time allowed to drain the queue - * - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); - } - } -} - -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); - } - } -} - -/** - * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets - * @sk: socket to perform retransmit on - * - * There are only four retransmittable packet types in DCCP: - * - Request in client-REQUEST state (sec. 8.1.1), - * - CloseReq in server-CLOSEREQ state (sec. 8.3), - * - Close in node-CLOSING state (sec. 8.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). - * This function expects sk->sk_send_head to contain the original skb. - */ -int dccp_retransmit_skb(struct sock *sk) -{ - WARN_ON(sk->sk_send_head == NULL); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) - return -EHOSTUNREACH; /* Routing failure or similar. */ - - /* this count is used to distinguish original and retransmitted skb */ - inet_csk(sk)->icsk_retransmits++; - - return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); -} - -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req) -{ - struct dccp_hdr *dh; - struct dccp_request_sock *dreq; - const u32 dccp_header_size = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_response); - struct sk_buff *skb; - - /* sk is marked const to clearly express we dont hold socket lock. - * sock_wmalloc() will atomically change sk->sk_wmem_alloc, - * it is safe to promote sk to non const. - */ - skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, - GFP_ATOMIC); - if (!skb) - return NULL; - - skb_reserve(skb, MAX_DCCP_HEADER); - - skb_dst_set(skb, dst_clone(dst)); - - dreq = dccp_rsk(req); - if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ - dccp_inc_seqno(&dreq->dreq_gss); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; - DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; - - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; - - /* Build and checksum header */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - - dh->dccph_sport = htons(inet_rsk(req)->ir_num); - dh->dccph_dport = inet_rsk(req)->ir_rmt_port; - dh->dccph_doff = (dccp_header_size + - DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; - dh->dccph_type = DCCP_PKT_RESPONSE; - dh->dccph_x = 1; - dccp_hdr_set_seq(dh, dreq->dreq_gss); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); - dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; - - dccp_csum_outgoing(skb); - - /* We use `acked' to remember that a Response was already sent. */ - inet_rsk(req)->acked = 1; - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - return skb; -response_failed: - kfree_skb(skb); - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_make_response); - -/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) -{ - struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); - const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_reset); - struct dccp_hdr_reset *dhr; - struct sk_buff *skb; - - skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - skb_reserve(skb, sk->sk_prot->max_header); - - /* Swap the send and the receive. */ - dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); - dh->dccph_type = DCCP_PKT_RESET; - dh->dccph_sport = rxdh->dccph_dport; - dh->dccph_dport = rxdh->dccph_sport; - dh->dccph_doff = dccp_hdr_reset_len / 4; - dh->dccph_x = 1; - - dhr = dccp_hdr_reset(skb); - dhr->dccph_reset_code = dcb->dccpd_reset_code; - - switch (dcb->dccpd_reset_code) { - case DCCP_RESET_CODE_PACKET_ERROR: - dhr->dccph_reset_data[0] = rxdh->dccph_type; - break; - case DCCP_RESET_CODE_OPTION_ERROR: - case DCCP_RESET_CODE_MANDATORY_ERROR: - memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); - break; - } - /* - * From RFC 4340, 8.3.1: - * If P.ackno exists, set R.seqno := P.ackno + 1. - * Else set R.seqno := 0. - */ - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); - - dccp_csum_outgoing(skb); - return skb; -} - -EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); - -/* send Reset on established socket, to close or abort the connection */ -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) -{ - struct sk_buff *skb; - /* - * FIXME: what if rebuild_header fails? - * Should we be doing a rebuild_header here? - */ - int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); - - if (err != 0) - return err; - - skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOBUFS; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; - DCCP_SKB_CB(skb)->dccpd_reset_code = code; - - return dccp_transmit_skb(sk, skb); -} - -/* - * Do all connect socket setups that can be done AF independent. - */ -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct dccp_sock *dp = dccp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - sk->sk_err = 0; - sock_reset_flag(sk, SOCK_DONE); - - dccp_sync_mss(sk, dst_mtu(dst)); - - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); - if (unlikely(skb == NULL)) - return -ENOBUFS; - - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); - - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); - DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); - - /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, DCCP_RTO_MAX); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_connect); - -void dccp_send_ack(struct sock *sk) -{ - /* If we have been reset, we may not send again. */ - if (sk->sk_state != DCCP_CLOSED) { - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, - GFP_ATOMIC); - - if (skb == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, - DCCP_RTO_MAX); - return; - } - - /* Reserve space for headers */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - dccp_transmit_skb(sk, skb); - } -} - -EXPORT_SYMBOL_GPL(dccp_send_ack); - -#if 0 -/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ -void dccp_send_delayed_ack(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - /* - * FIXME: tune this timer. elapsed time fixes the skew, so no problem - * with using 2s, and active senders also piggyback the ACK into a - * DATAACK packet, so this is really for quiescent senders. - */ - unsigned long timeout = jiffies + 2 * HZ; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - * - * FIXME: check the "about to expire" part - */ - if (icsk->icsk_ack.blocked) { - dccp_send_ack(sk); - return; - } - - if (!time_before(timeout, icsk_delack_timeout(icsk))) - timeout = icsk_delack_timeout(icsk); - } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); -} -#endif - -void dccp_send_sync(struct sock *sk, const u64 ackno, - const enum dccp_pkt_type pkt_type) -{ - /* - * We are not putting this on the write queue, so - * dccp_transmit_skb() will set the ownership to this - * sock. - */ - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - - if (skb == NULL) { - /* FIXME: how to make sure the sync is sent? */ - DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); - return; - } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = pkt_type; - DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - - dccp_transmit_skb(sk, skb); -} - -EXPORT_SYMBOL_GPL(dccp_send_sync); - -/* - * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This - * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under - * any circumstances. - */ -void dccp_send_close(struct sock *sk, const int active) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; - - skb = alloc_skb(sk->sk_prot->max_header, prio); - if (skb == NULL) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; - else - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; - - if (active) { - skb = dccp_skb_entail(sk, skb); - /* - * Retransmission timer for active-close: RFC 4340, 8.3 requires - * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ - * state can be left. The initial timeout is 2 RTTs. - * Since RTT measurement is done by the CCIDs, there is no easy - * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 - * is too low (200ms); we use a high value to avoid unnecessary - * retransmissions when the link RTT is > 0.2 seconds. - * FIXME: Let main module sample RTTs and use that instead. - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c deleted file mode 100644 index fcc5c9d64f46..000000000000 --- a/net/dccp/proto.c +++ /dev/null @@ -1,1293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/proto.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; - -EXPORT_SYMBOL_GPL(dccp_statistics); - -DEFINE_PER_CPU(unsigned int, dccp_orphan_count); -EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); - -struct inet_hashinfo dccp_hashinfo; -EXPORT_SYMBOL_GPL(dccp_hashinfo); - -/* the maximum queue length for tx in packets. 0 is no limit */ -int sysctl_dccp_tx_qlen __read_mostly = 5; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} -#endif - -void dccp_set_state(struct sock *sk, const int state) -{ - const int oldstate = sk->sk_state; - - dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, - dccp_state_name(oldstate), dccp_state_name(state)); - WARN_ON(state == oldstate); - - switch (state) { - case DCCP_OPEN: - if (oldstate != DCCP_OPEN) - DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); - break; - - case DCCP_CLOSED: - if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || - oldstate == DCCP_CLOSING) - DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); - - sk->sk_prot->unhash(sk); - if (inet_csk(sk)->icsk_bind_hash != NULL && - !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - inet_put_port(sk); - fallthrough; - default: - if (oldstate == DCCP_OPEN) - DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); - } - - /* Change state AFTER socket is unhashed to avoid closed - * socket sitting in hash tables. - */ - inet_sk_set_state(sk, state); -} - -EXPORT_SYMBOL_GPL(dccp_set_state); - -static void dccp_finish_passive_close(struct sock *sk) -{ - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - /* Node (client or server) has received Close packet. */ - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_set_state(sk, DCCP_CLOSED); - break; - case DCCP_PASSIVE_CLOSEREQ: - /* - * Client received CloseReq. We set the `active' flag so that - * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. - */ - dccp_send_close(sk, 1); - dccp_set_state(sk, DCCP_CLOSING); - } -} - -void dccp_done(struct sock *sk) -{ - dccp_set_state(sk, DCCP_CLOSED); - dccp_clear_xmit_timers(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - else - inet_csk_destroy_sock(sk); -} - -EXPORT_SYMBOL_GPL(dccp_done); - -const char *dccp_packet_name(const int type) -{ - static const char *const dccp_packet_names[] = { - [DCCP_PKT_REQUEST] = "REQUEST", - [DCCP_PKT_RESPONSE] = "RESPONSE", - [DCCP_PKT_DATA] = "DATA", - [DCCP_PKT_ACK] = "ACK", - [DCCP_PKT_DATAACK] = "DATAACK", - [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", - [DCCP_PKT_CLOSE] = "CLOSE", - [DCCP_PKT_RESET] = "RESET", - [DCCP_PKT_SYNC] = "SYNC", - [DCCP_PKT_SYNCACK] = "SYNCACK", - }; - - if (type >= DCCP_NR_PKT_TYPES) - return "INVALID"; - else - return dccp_packet_names[type]; -} - -EXPORT_SYMBOL_GPL(dccp_packet_name); - -void dccp_destruct_common(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = NULL; -} -EXPORT_SYMBOL_GPL(dccp_destruct_common); - -static void dccp_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet_sock_destruct(sk); -} - -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - sk->sk_destruct = dccp_sk_destruct; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; - - dccp_init_xmit_timers(sk); - - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_init_sock); - -void dccp_destroy_sock(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - /* Clean up a referenced DCCP bind bucket. */ - if (inet_csk(sk)->icsk_bind_hash != NULL) - inet_put_port(sk); - - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - - if (dp->dccps_hc_rx_ackvec != NULL) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_destroy_sock); - -static inline int dccp_need_reset(int state) -{ - return state != DCCP_CLOSED && state != DCCP_LISTEN && - state != DCCP_REQUESTING; -} - -int dccp_disconnect(struct sock *sk, int flags) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - const int old_state = sk->sk_state; - - if (old_state != DCCP_CLOSED) - dccp_set_state(sk, DCCP_CLOSED); - - /* - * This corresponds to the ABORT function of RFC793, sec. 3.8 - * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". - */ - if (old_state == DCCP_LISTEN) { - inet_csk_listen_stop(sk); - } else if (dccp_need_reset(old_state)) { - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - sk->sk_err = ECONNRESET; - } else if (old_state == DCCP_REQUESTING) - sk->sk_err = ECONNRESET; - - dccp_clear_xmit_timers(sk); - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - __kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - inet->inet_dport = 0; - - inet_bhash2_reset_saddr(sk); - - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - - icsk->icsk_backoff = 0; - inet_csk_delack_init(sk); - __sk_dst_reset(sk); - - WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - - sk_error_report(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_disconnect); - -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - __poll_t mask; - u8 shutdown; - int state; - - sock_poll_wait(file, sock, wait); - - state = inet_sk_state_load(sk); - if (state == DCCP_LISTEN) - return inet_csk_listen_poll(sk); - - /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. - */ - - mask = 0; - if (READ_ONCE(sk->sk_err)) - mask = EPOLLERR; - shutdown = READ_ONCE(sk->sk_shutdown); - - if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) - mask |= EPOLLHUP; - if (shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - - /* Connected? */ - if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { - if (atomic_read(&sk->sk_rmem_alloc) > 0) - mask |= EPOLLIN | EPOLLRDNORM; - - if (!(shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { - mask |= EPOLLOUT | EPOLLWRNORM; - } else { /* send SIGIO later */ - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* Race breaker. If space is freed after - * wspace test but before the flags are set, - * IO signal will be lost. - */ - if (sk_stream_is_writeable(sk)) - mask |= EPOLLOUT | EPOLLWRNORM; - } - } - } - return mask; -} -EXPORT_SYMBOL_GPL(dccp_poll); - -int dccp_ioctl(struct sock *sk, int cmd, int *karg) -{ - int rc = -ENOTCONN; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) - goto out; - - switch (cmd) { - case SIOCOUTQ: { - *karg = sk_wmem_alloc_get(sk); - /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and - * always 0, comparably to UDP. - */ - - rc = 0; - } - break; - case SIOCINQ: { - struct sk_buff *skb; - *karg = 0; - - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount of this packet since - * that is all that will be read. - */ - *karg = skb->len; - } - rc = 0; - } - break; - default: - rc = -ENOIOCTLCMD; - break; - } -out: - release_sock(sk); - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_ioctl); - -static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_service_list *sl = NULL; - - if (service == DCCP_SERVICE_INVALID_VALUE || - optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) - return -EINVAL; - - if (optlen > sizeof(service)) { - sl = kmalloc(optlen, GFP_KERNEL); - if (sl == NULL) - return -ENOMEM; - - sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_sockptr_offset(sl->dccpsl_list, optval, - sizeof(service), optlen - sizeof(service)) || - dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { - kfree(sl); - return -EFAULT; - } - } - - lock_sock(sk); - dp->dccps_service = service; - - kfree(dp->dccps_service_list); - - dp->dccps_service_list = sl; - release_sock(sk); - return 0; -} - -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) -{ - u8 *list, len; - int i, rc; - - if (cscov < 0 || cscov > 15) - return -EINVAL; - /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. - */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - sockptr_t optval, unsigned int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) - return -EINVAL; - - val = memdup_sockptr(optval, optlen); - if (IS_ERR(val)) - return PTR_ERR(val); - - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); - - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); - - kfree(val); - return rc; -} - -static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - int val, err = 0; - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - if (optname == DCCP_SOCKOPT_SERVICE) - return dccp_setsockopt_service(sk, val, optval, optlen); - - lock_sock(sk); - switch (optname) { - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - if (dp->dccps_role != DCCP_ROLE_SERVER) - err = -EOPNOTSUPP; - else - dp->dccps_server_timewait = (val != 0); - break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) - err = -EINVAL; - else - dp->dccps_qpolicy = val; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) - err = -EINVAL; - else - dp->dccps_tx_qlen = val; - break; - default: - err = -ENOPROTOOPT; - break; - } - release_sock(sk); - - return err; -} - -int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_setsockopt); - -static int dccp_getsockopt_service(struct sock *sk, int len, - __be32 __user *optval, - int __user *optlen) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_service_list *sl; - int err = -ENOENT, slen = 0, total_len = sizeof(u32); - - lock_sock(sk); - if ((sl = dp->dccps_service_list) != NULL) { - slen = sl->dccpsl_nr * sizeof(u32); - total_len += slen; - } - - err = -EINVAL; - if (total_len > len) - goto out; - - err = 0; - if (put_user(total_len, optlen) || - put_user(dp->dccps_service, optval) || - (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) - err = -EFAULT; -out: - release_sock(sk); - return err; -} - -static int do_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct dccp_sock *dp; - int val, len; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < (int)sizeof(int)) - return -EINVAL; - - dp = dccp_sk(sk); - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_SERVICE: - return dccp_getsockopt_service(sk, len, - (__be32 __user *)optval, optlen); - case DCCP_SOCKOPT_GET_CUR_MPS: - val = READ_ONCE(dp->dccps_mss_cache); - break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - val = dp->dccps_server_timewait; - break; - case DCCP_SOCKOPT_SEND_CSCOV: - val = dp->dccps_pcslen; - break; - case DCCP_SOCKOPT_RECV_CSCOV: - val = dp->dccps_pcrlen; - break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; - case 128 ... 191: - return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - case 192 ... 255: - return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - default: - return -ENOPROTOOPT; - } - - len = sizeof(val); - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_getsockopt); - -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg; - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const int flags = msg->msg_flags; - const int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; - int rc, size; - long timeo; - - trace_dccp_probe(sk, len); - - if (len > READ_ONCE(dp->dccps_mss_cache)) - return -EMSGSIZE; - - lock_sock(sk); - - timeo = sock_sndtimeo(sk, noblock); - - /* - * We have to use sk_stream_wait_connect here to set sk_write_pending, - * so that the trick in dccp_rcv_request_sent_state_process. - */ - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_release; - - size = sk->sk_prot->max_header + len; - release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - lock_sock(sk); - if (skb == NULL) - goto out_release; - - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_discard; - } - - if (sk->sk_state == DCCP_CLOSED) { - rc = -ENOTCONN; - goto out_discard; - } - - /* We need to check dccps_mss_cache after socket is locked. */ - if (len > dp->dccps_mss_cache) { - rc = -EMSGSIZE; - goto out_discard; - } - - skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc != 0) - goto out_discard; - - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - /* - * The xmit_timer is set if the TX CCID is rate-based and will expire - * when congestion control permits to release further packets into the - * network. Window-based CCIDs do not use this timer. - */ - if (!timer_pending(&dp->dccps_xmit_timer)) - dccp_write_xmit(sk); -out_release: - release_sock(sk); - return rc ? : len; -out_discard: - kfree_skb(skb); - goto out_release; -} - -EXPORT_SYMBOL_GPL(dccp_sendmsg); - -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) -{ - const struct dccp_hdr *dh; - long timeo; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) { - len = -ENOTCONN; - goto out; - } - - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - if (skb == NULL) - goto verify_sock_status; - - dh = dccp_hdr(skb); - - switch (dh->dccph_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - goto found_ok_skb; - - case DCCP_PKT_CLOSE: - case DCCP_PKT_CLOSEREQ: - if (!(flags & MSG_PEEK)) - dccp_finish_passive_close(sk); - fallthrough; - case DCCP_PKT_RESET: - dccp_pr_debug("found fin (%s) ok!\n", - dccp_packet_name(dh->dccph_type)); - len = 0; - goto found_fin_ok; - default: - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); - } -verify_sock_status: - if (sock_flag(sk, SOCK_DONE)) { - len = 0; - break; - } - - if (sk->sk_err) { - len = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - len = 0; - break; - } - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - len = -ENOTCONN; - break; - } - len = 0; - break; - } - - if (!timeo) { - len = -EAGAIN; - break; - } - - if (signal_pending(current)) { - len = sock_intr_errno(timeo); - break; - } - - sk_wait_data(sk, &timeo, NULL); - continue; - found_ok_skb: - if (len > skb->len) - len = skb->len; - else if (len < skb->len) - msg->msg_flags |= MSG_TRUNC; - - if (skb_copy_datagram_msg(skb, 0, msg, len)) { - /* Exception. Bailout! */ - len = -EFAULT; - break; - } - if (flags & MSG_TRUNC) - len = skb->len; - found_fin_ok: - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - break; - } while (1); -out: - release_sock(sk); - return len; -} - -EXPORT_SYMBOL_GPL(dccp_recvmsg); - -int inet_dccp_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) - goto out; - - WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. - */ - if (old_state != DCCP_LISTEN) { - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_role = DCCP_ROLE_LISTEN; - - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) { - err = -EPROTO; - goto out; - } - - err = inet_csk_listen_start(sk); - if (err) - goto out; - } - err = 0; - -out: - release_sock(sk); - return err; -} - -EXPORT_SYMBOL_GPL(inet_dccp_listen); - -static void dccp_terminate_connection(struct sock *sk) -{ - u8 next_state = DCCP_CLOSED; - - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - case DCCP_PASSIVE_CLOSEREQ: - dccp_finish_passive_close(sk); - break; - case DCCP_PARTOPEN: - dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - fallthrough; - case DCCP_OPEN: - dccp_send_close(sk, 1); - - if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && - !dccp_sk(sk)->dccps_server_timewait) - next_state = DCCP_ACTIVE_CLOSEREQ; - else - next_state = DCCP_CLOSING; - fallthrough; - default: - dccp_set_state(sk, next_state); - } -} - -void dccp_close(struct sock *sk, long timeout) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - u32 data_was_unread = 0; - int state; - - lock_sock(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (sk->sk_state == DCCP_LISTEN) { - dccp_set_state(sk, DCCP_CLOSED); - - /* Special case. */ - inet_csk_listen_stop(sk); - - goto adjudge_to_death; - } - - sk_stop_timer(sk, &dp->dccps_xmit_timer); - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - *reader process may not have drained the data yet! - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - data_was_unread += skb->len; - __kfree_skb(skb); - } - - /* If socket has been already reset kill it. */ - if (sk->sk_state == DCCP_CLOSED) - goto adjudge_to_death; - - if (data_was_unread) { - /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_set_state(sk, DCCP_CLOSED); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); - } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); - dccp_terminate_connection(sk); - } - - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - - sk_stream_wait_close(sk, timeout); - -adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - - /* - * It is the last release_sock in its life. It will remove backlog. - */ - release_sock(sk); - /* - * Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ - local_bh_disable(); - bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); - - this_cpu_inc(dccp_orphan_count); - - /* Have we already been destroyed by a softirq or backlog? */ - if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) - goto out; - - if (sk->sk_state == DCCP_CLOSED) - inet_csk_destroy_sock(sk); - - /* Otherwise, socket is reprieved until protocol close. */ - -out: - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); -} - -EXPORT_SYMBOL_GPL(dccp_close); - -void dccp_shutdown(struct sock *sk, int how) -{ - dccp_pr_debug("called shutdown(%x)\n", how); -} - -EXPORT_SYMBOL_GPL(dccp_shutdown); - -static inline int __init dccp_mib_init(void) -{ - dccp_statistics = alloc_percpu(struct dccp_mib); - if (!dccp_statistics) - return -ENOMEM; - return 0; -} - -static inline void dccp_mib_exit(void) -{ - free_percpu(dccp_statistics); -} - -static int thash_entries; -module_param(thash_entries, int, 0444); -MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); - -#ifdef CONFIG_IP_DCCP_DEBUG -bool dccp_debug; -module_param(dccp_debug, bool, 0644); -MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); - -EXPORT_SYMBOL_GPL(dccp_debug); -#endif - -static int __init dccp_init(void) -{ - unsigned long goal; - unsigned long nr_pages = totalram_pages(); - int ehash_order, bhash_order, i; - int rc; - - BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - sizeof_field(struct sk_buff, cb)); - rc = inet_hashinfo2_init_mod(&dccp_hashinfo); - if (rc) - goto out_fail; - rc = -ENOBUFS; - dccp_hashinfo.bind_bucket_cachep = - kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; - - /* - * Size and allocate the main established and bind bucket - * hash tables. - * - * The methodology is similar to that of the buffer cache. - */ - if (nr_pages >= (128 * 1024)) - goal = nr_pages >> (21 - PAGE_SHIFT); - else - goal = nr_pages >> (23 - PAGE_SHIFT); - - if (thash_entries) - goal = (thash_entries * - sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; - for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) - ; - do { - unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / - sizeof(struct inet_ehash_bucket); - - while (hash_size & (hash_size - 1)) - hash_size--; - dccp_hashinfo.ehash_mask = hash_size - 1; - dccp_hashinfo.ehash = (struct inet_ehash_bucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); - } while (!dccp_hashinfo.ehash && --ehash_order > 0); - - if (!dccp_hashinfo.ehash) { - DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; - } - - for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) - INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); - - if (inet_ehash_locks_alloc(&dccp_hashinfo)) - goto out_free_dccp_ehash; - - bhash_order = ehash_order; - - do { - dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / - sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && - bhash_order > 0) - continue; - dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); - } while (!dccp_hashinfo.bhash && --bhash_order >= 0); - - if (!dccp_hashinfo.bhash) { - DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_locks; - } - - dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { - spin_lock_init(&dccp_hashinfo.bhash[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - spin_lock_init(&dccp_hashinfo.bhash2[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); - } - - dccp_hashinfo.pernet = false; - - rc = dccp_mib_init(); - if (rc) - goto out_free_dccp_bhash2; - - rc = dccp_ackvec_init(); - if (rc) - goto out_free_dccp_mib; - - rc = dccp_sysctl_init(); - if (rc) - goto out_ackvec_exit; - - rc = ccid_initialize_builtins(); - if (rc) - goto out_sysctl_exit; - - dccp_timestamping_init(); - - return 0; - -out_sysctl_exit: - dccp_sysctl_exit(); -out_ackvec_exit: - dccp_ackvec_exit(); -out_free_dccp_mib: - dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); -out_free_dccp_bhash: - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); -out_free_dccp_locks: - inet_ehash_locks_free(&dccp_hashinfo); -out_free_dccp_ehash: - free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); -out_free_bind_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); -out_free_hashinfo2: - inet_hashinfo2_free_mod(&dccp_hashinfo); -out_fail: - dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; - dccp_hashinfo.ehash = NULL; - dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; - return rc; -} - -static void __exit dccp_fini(void) -{ - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - - ccid_cleanup_builtins(); - dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); - free_pages((unsigned long)dccp_hashinfo.ehash, - get_order((dccp_hashinfo.ehash_mask + 1) * - sizeof(struct inet_ehash_bucket))); - inet_ehash_locks_free(&dccp_hashinfo); - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - dccp_ackvec_exit(); - dccp_sysctl_exit(); - inet_hashinfo2_free_mod(&dccp_hashinfo); -} - -module_init(dccp_init); -module_exit(dccp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 5ba204ec0aca..000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - * @params: parameter passed to policy operation - */ -struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; -}; - -static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - if (skb != NULL) { - /* Clear any skb fields that we used internally */ - skb->priority = 0; - skb_unlink(skb, &sk->sk_write_queue); - } - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c deleted file mode 100644 index b15845fd6300..000000000000 --- a/net/dccp/sysctl.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/sysctl.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include "dccp.h" -#include "feat.h" - -/* Boundary values */ -static int u8_max = 0xFF; -static unsigned long seqw_min = DCCPF_SEQ_WMIN, - seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ - -static struct ctl_table dccp_default_table[] = { - { - .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ - .extra2 = &seqw_max, - }, - { - .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "request_retries", - .data = &sysctl_dccp_request_retries, - .maxlen = sizeof(sysctl_dccp_request_retries), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &u8_max, - }, - { - .procname = "retries1", - .data = &sysctl_dccp_retries1, - .maxlen = sizeof(sysctl_dccp_retries1), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "retries2", - .data = &sysctl_dccp_retries2, - .maxlen = sizeof(sysctl_dccp_retries2), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "tx_qlen", - .data = &sysctl_dccp_tx_qlen, - .maxlen = sizeof(sysctl_dccp_tx_qlen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "sync_ratelimit", - .data = &sysctl_dccp_sync_ratelimit, - .maxlen = sizeof(sysctl_dccp_sync_ratelimit), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, -}; - -static struct ctl_table_header *dccp_table_header; - -int __init dccp_sysctl_init(void) -{ - dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default", - dccp_default_table); - - return dccp_table_header != NULL ? 0 : -ENOMEM; -} - -void dccp_sysctl_exit(void) -{ - if (dccp_table_header != NULL) { - unregister_net_sysctl_table(dccp_table_header); - dccp_table_header = NULL; - } -} diff --git a/net/dccp/timer.c b/net/dccp/timer.c deleted file mode 100644 index 232ac4ae0a73..000000000000 --- a/net/dccp/timer.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/timer.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include "dccp.h" - -/* sysctl variables governing numbers of retransmission attempts */ -int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; -int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; - -static void dccp_write_err(struct sock *sk) -{ - sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; - sk_error_report(sk); - - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_done(sk); - __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); -} - -/* A write timeout has occurred. Process the after effects. */ -static int dccp_write_timeout(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - int retry_until; - - if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { - if (icsk->icsk_retransmits != 0) - dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? - : sysctl_dccp_request_retries; - } else { - if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu - black hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ - - dst_negative_advice(sk); - } - - retry_until = sysctl_dccp_retries2; - /* - * FIXME: see tcp_write_timout and tcp_out_of_resources - */ - } - - if (icsk->icsk_retransmits >= retry_until) { - /* Has it gone just too far? */ - dccp_write_err(sk); - return 1; - } - return 0; -} - -/* - * The DCCP retransmit timer. - */ -static void dccp_retransmit_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - /* - * More than 4MSL (8 minutes) has passed, a RESET(aborted) was - * sent, no need to retransmit, this sock is dead. - */ - if (dccp_write_timeout(sk)) - return; - - /* - * We want to know the number of packets retransmitted, not the - * total number of retransmissions of clones of original packets. - */ - if (icsk->icsk_retransmits == 0) - __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); - - if (dccp_retransmit_skb(sk) != 0) { - /* - * Retransmission failed because of local congestion, - * do not backoff. - */ - if (--icsk->icsk_retransmits == 0) - icsk->icsk_retransmits = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, - TCP_RESOURCE_PROBE_INTERVAL), - DCCP_RTO_MAX); - return; - } - - icsk->icsk_backoff++; - - icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, - DCCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_dccp_retries1) - __sk_dst_reset(sk); -} - -static void dccp_write_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; - int event = 0; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - jiffies + (HZ / 20)); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) - goto out; - - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); - goto out; - } - - event = icsk->icsk_pending; - icsk->icsk_pending = 0; - - switch (event) { - case ICSK_TIME_RETRANS: - dccp_retransmit_timer(sk); - break; - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_keepalive_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - - pr_err("dccp should not use a keepalive timer !\n"); - sock_put(sk); -} - -/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); - struct sock *sk = &icsk->icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, - jiffies + TCP_DELACK_MIN); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; - if (time_after(icsk_delack_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk_delack_timeout(icsk)); - goto out; - } - - icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - - if (inet_csk_ack_scheduled(sk)) { - if (!inet_csk_in_pingpong_mode(sk)) { - /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, - icsk->icsk_rto); - } else { - /* Delayed ACK missed: leave pingpong mode and - * deflate ATO. - */ - inet_csk_exit_pingpong_mode(sk); - icsk->icsk_ack.ato = TCP_ATO_MIN; - } - dccp_send_ack(sk); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @t: pointer to the tasklet associated with this handler - * - * See the comments above %ccid_dequeueing_decision for supported modes. - */ -static void dccp_write_xmitlet(struct tasklet_struct *t) -{ - struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); - else - dccp_write_xmit(sk); - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_write_xmit_timer(struct timer_list *t) -{ - struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - - dccp_write_xmitlet(&dp->dccps_xmitlet); -} - -void dccp_init_xmit_timers(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); - timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); - inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, - &dccp_keepalive_timer); -} - -static ktime_t dccp_timestamp_seed; -/** - * dccp_timestamp - 10s of microseconds time source - * Returns the number of 10s of microseconds since loading DCCP. This is native - * DCCP time difference format (RFC 4340, sec. 13). - * Please note: This will wrap around about circa every 11.9 hours. - */ -u32 dccp_timestamp(void) -{ - u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - - do_div(delta, 10); - return delta; -} -EXPORT_SYMBOL_GPL(dccp_timestamp); - -void __init dccp_timestamping_init(void) -{ - dccp_timestamp_seed = ktime_get_real(); -} diff --git a/net/dccp/trace.h b/net/dccp/trace.h deleted file mode 100644 index 5a43b3508c7f..000000000000 --- a/net/dccp/trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM dccp - -#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_DCCP_H - -#include -#include "dccp.h" -#include "ccids/ccid3.h" -#include -#include - -TRACE_EVENT(dccp_probe, - - TP_PROTO(struct sock *sk, size_t size), - - TP_ARGS(sk, size), - - TP_STRUCT__entry( - /* sockaddr_in6 is always bigger than sockaddr_in */ - __array(__u8, saddr, sizeof(struct sockaddr_in6)) - __array(__u8, daddr, sizeof(struct sockaddr_in6)) - __field(__u16, sport) - __field(__u16, dport) - __field(__u16, size) - __field(__u16, tx_s) - __field(__u32, tx_rtt) - __field(__u32, tx_p) - __field(__u32, tx_x_calc) - __field(__u64, tx_x_recv) - __field(__u64, tx_x) - __field(__u32, tx_t_ipi) - ), - - TP_fast_assign( - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); - memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - - TP_STORE_ADDR_PORTS(__entry, inet, sk); - - /* For filtering use */ - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - __entry->size = size; - if (hc) { - __entry->tx_s = hc->tx_s; - __entry->tx_rtt = hc->tx_rtt; - __entry->tx_p = hc->tx_p; - __entry->tx_x_calc = hc->tx_x_calc; - __entry->tx_x_recv = hc->tx_x_recv >> 6; - __entry->tx_x = hc->tx_x >> 6; - __entry->tx_t_ipi = hc->tx_t_ipi; - } else { - __entry->tx_s = 0; - memset_startat(__entry, 0, tx_rtt); - } - ), - - TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " - "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", - __entry->saddr, __entry->daddr, __entry->size, - __entry->tx_s, __entry->tx_rtt, __entry->tx_p, - __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, - __entry->tx_t_ipi) -); - -#endif /* _TRACE_TCP_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace -#include diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d2c97f8e9ef..12850a277251 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,7 +425,7 @@ config INET_DIAG tristate "INET: socket monitoring interface" default y help - Support for INET (TCP, DCCP, etc) socket monitoring interface used by + Support for INET (TCP, UDP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5df1f1325259..76e38092cd8a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1328,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk) /* Routing failed... */ sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme - */ + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dd5cf8914a28..9a20b84dc022 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1322,7 +1322,7 @@ void inet_csk_destroy_sock(struct sock *sk) EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to - * tcp/dccp_create_openreq_child(). + * tcp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c2bb91d9e9ff..907bad776b42 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type) switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; - case DCCPDIAG_GETSOCK: - return IPPROTO_DCCP; default: return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 581bc6289081..ef052ccd9380 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -259,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index b7997541f7ee..f93d9145ab8a 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -31,7 +31,6 @@ static inline int proto_ports_offset(__u64 proto) switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_DCCP: case IPPROTO_ESP: case IPPROTO_SCTP: case IPPROTO_UDPLITE: diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3d22bf863eec..298c8f8d7719 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4804,7 +4804,7 @@ sub process { } # do not use BUG() or variants - if ($line =~ /\b(?!AA_|BUILD_|DCCP_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { + if ($line =~ /\b(?!AA_|BUILD_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { my $msg_level = \&WARN; $msg_level = \&CHK if ($file); &{$msg_level}("AVOID_BUG", diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 1b942b4908a2..7d623b00495c 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -68,13 +67,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr *dh = dccp_hdr(skb); - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); @@ -140,17 +132,6 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e7a7dcab81db..b2695785610a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -1191,8 +1190,6 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; - case SOCK_DCCP: - return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } @@ -4392,22 +4389,6 @@ static int selinux_parse_skb_ipv4(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - offset += ihlen; - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4486,18 +4467,6 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4849,10 +4818,6 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in node_perm = UDP_SOCKET__NODE_BIND; break; - case SECCLASS_DCCP_SOCKET: - node_perm = DCCP_SOCKET__NODE_BIND; - break; - case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; @@ -4908,11 +4873,10 @@ static int selinux_socket_connect_helper(struct socket *sock, return 0; /* - * If a TCP, DCCP or SCTP socket, check name_connect permission + * If a TCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || - sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; @@ -4957,9 +4921,6 @@ static int selinux_socket_connect_helper(struct socket *sock, case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; - case SECCLASS_DCCP_SOCKET: - perm = DCCP_SOCKET__NAME_CONNECT; - break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 04a9b480885e..5665aa5e7853 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -127,8 +127,6 @@ const struct security_class_mapping secclass_map[] = { { "key", { "view", "read", "write", "search", "link", "setattr", "create", NULL } }, - { "dccp_socket", - { COMMON_SOCK_PERMS, "node_bind", "name_connect", NULL } }, { "memprotect", { "mmap_zero", NULL } }, { "peer", { "recv", NULL } }, { "capability2", { COMMON_CAP2_PERMS, NULL } }, diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 3a95986b134f..2c0b07f9fbbd 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -98,7 +98,6 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, - { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE }, }; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 99833168604e..fc340a6f0dde 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -4061,7 +4060,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) __be16 frag_off; struct tcphdr _tcph, *th; struct udphdr _udph, *uh; - struct dccp_hdr _dccph, *dh; sip->sin6_port = 0; @@ -4090,11 +4088,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) if (uh != NULL) sip->sin6_port = uh->source; break; - case IPPROTO_DCCP: - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh != NULL) - sip->sin6_port = dh->dccph_sport; - break; } return proto; } @@ -4216,7 +4209,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) case PF_INET6: proto = smk_skb_to_addr_ipv6(skb, &sadd); if (proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && - proto != IPPROTO_TCP && proto != IPPROTO_DCCP) + proto != IPPROTO_TCP) break; #ifdef SMACK_IPV6_SECMARK_LABELING skp = smack_from_skb(skb); -- cgit v1.2.3 From 22d6c9eebf2e68e6ab831ded37daaa83daff6bb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:46 -0700 Subject: net: Unexport shared functions for DCCP. DCCP was removed, so many inet functions no longer need to be exported. Let's unexport or use EXPORT_IPV6_MOD() for such functions. sk_free_unlock_clone() is inlined in sk_clone_lock() as it's the only caller. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/core/sock.c | 32 ++++++++++++++------------------ net/ipv4/inet_connection_sock.c | 12 +----------- net/ipv4/inet_hashtables.c | 16 +++++----------- net/ipv4/inet_timewait_sock.c | 4 ---- net/ipv6/af_inet6.c | 1 - net/ipv6/inet6_connection_sock.c | 2 -- 7 files changed, 20 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d4..bb4d6189292f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index 121f64011288..b64df2463300 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2494,17 +2494,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2534,18 +2531,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 9a20b84dc022..a195203e7eef 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -767,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -780,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -831,7 +829,6 @@ no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -898,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) req->num_retrans++; return err; } -EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, @@ -1058,14 +1054,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { @@ -1209,7 +1204,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) @@ -1290,7 +1284,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1380,7 +1373,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1475,7 +1467,6 @@ child_put: sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1590,4 +1581,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5bf163f756e9..d1960701a3b4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -713,7 +713,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) @@ -771,7 +771,7 @@ unlock: return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -782,7 +782,6 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { @@ -823,7 +822,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -982,14 +981,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -1214,7 +1213,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1265,7 +1263,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { @@ -1305,7 +1302,6 @@ set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1341,7 +1337,6 @@ free_hashinfo: err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1352,4 +1347,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index aded4bf1bc16..67efe9501581 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. @@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ restart: rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea..85bf681d427b 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -881,7 +881,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dbcf556a35bb..8f500eaf33cf 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); -- cgit v1.2.3 From 235bd9d21fcdf07dd125daa3e60ab64f8aefb927 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:47 -0700 Subject: tcp: Rename tcp_or_dccp_get_hashinfo(). DCCP was removed, so tcp_or_dccp_get_hashinfo() should be renamed. Let's rename it to tcp_get_hashinfo(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 +-- net/ipv4/inet_connection_sock.c | 9 +++++---- net/ipv4/inet_hashtables.c | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index d172b64a6320..4564b5d348b1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -175,9 +175,8 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { - /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a195203e7eef..20915895bdaa 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); @@ -512,10 +512,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; @@ -1022,9 +1022,10 @@ static bool reqsk_queue_unlink(struct request_sock *req) bool found = false; if (sk_hashed(sk)) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); - spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); + spinlock_t *lock; + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d1960701a3b4..da85cc30e382 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -176,7 +176,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; @@ -215,7 +215,7 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; @@ -668,7 +668,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; @@ -740,7 +740,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -785,7 +785,7 @@ int inet_hash(struct sock *sk) void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; @@ -873,7 +873,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) @@ -901,7 +901,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); -- cgit v1.2.3 From c26c192c3d486a2a7d83d254bae294c2f8f50abf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Apr 2025 22:00:56 +0200 Subject: udp: properly deal with xfrm encap and ADDRFORM UDP GRO accounting assumes that the GRO receive callback is always set when the UDP tunnel is enabled, but syzkaller proved otherwise, leading tot the following splat: WARNING: CPU: 0 PID: 5837 at net/ipv4/udp_offload.c:123 udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Modules linked in: CPU: 0 UID: 0 PID: 5837 Comm: syz-executor850 Not tainted 6.14.0-syzkaller-13320-g420aabef3ab5 #0 PREEMPT(full) Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Code: 00 00 e8 c6 5a 2f f7 48 c1 e5 04 48 8d b5 20 53 c7 9a ba 10 00 00 00 4c 89 ff e8 ce 87 99 f7 e9 ce 00 00 00 e8 a4 5a 2f f7 90 <0f> 0b 90 e9 de fd ff ff bf 01 00 00 00 89 ee e8 cf 5e 2f f7 85 ed RSP: 0018:ffffc90003effa88 EFLAGS: 00010293 RAX: ffffffff8a93fc9c RBX: 0000000000000000 RCX: ffff8880306f9e00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8a93fabe R09: 1ffffffff20bfb2e R10: dffffc0000000000 R11: fffffbfff20bfb2f R12: ffff88814ef21738 R13: dffffc0000000000 R14: ffff88814ef21778 R15: 1ffff11029de42ef FS: 0000000000000000(0000) GS:ffff888124f96000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f04eec760d0 CR3: 000000000eb38000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: udp_tunnel_cleanup_gro include/net/udp_tunnel.h:205 [inline] udpv6_destroy_sock+0x212/0x270 net/ipv6/udp.c:1829 sk_common_release+0x71/0x2e0 net/core/sock.c:3896 inet_release+0x17d/0x200 net/ipv4/af_inet.c:435 __sock_release net/socket.c:647 [inline] sock_close+0xbc/0x240 net/socket.c:1391 __fput+0x3e9/0x9f0 fs/file_table.c:465 task_work_run+0x251/0x310 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0xa11/0x27f0 kernel/exit.c:953 do_group_exit+0x207/0x2c0 kernel/exit.c:1102 __do_sys_exit_group kernel/exit.c:1113 [inline] __se_sys_exit_group kernel/exit.c:1111 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1111 x64_sys_call+0x26c3/0x26d0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04eebfac79 Code: Unable to access opcode bytes at 0x7f04eebfac4f. RSP: 002b:00007fffdcaa34a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f04eebfac79 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007f04eec75270 R08: ffffffffffffffb8 R09: 00007fffdcaa36c8 R10: 0000200000000000 R11: 0000000000000246 R12: 00007f04eec75270 R13: 0000000000000000 R14: 00007f04eec75cc0 R15: 00007f04eebcca70 Address the issue moving the accounting hook into setup_udp_tunnel_sock() and set_xfrm_gro_udp_encap_rcv(), where the GRO callback is actually set. set_xfrm_gro_udp_encap_rcv() is prone to races with IPV6_ADDRFORM, run the relevant setsockopt under the socket lock to ensure using consistent values of sk_family and up->encap_type. Refactor the GRO callback selection code, to make it clear that the function pointer is always initialized. Reported-by: syzbot+8c469a2260132cd095c1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8c469a2260132cd095c1 Fixes: 172bf009c18d ("xfrm: Support GRO for IPv4 ESP in UDP encapsulation") Fixes: 5d7f5b2f6b935 ("udp_tunnel: use static call for GRO hooks when possible") Signed-off-by: Paolo Abeni Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/92bcdb6899145a9a387c8fa9e3ca656642a43634.1744228733.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 1 - net/ipv4/udp.c | 31 ++++++++++++++++++++++++++----- net/ipv4/udp_tunnel_core.c | 2 ++ 3 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 288f06f23a80..2df3b8344eb5 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -215,7 +215,6 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif - udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8867fe687888..f9f5b92cf4b6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2904,15 +2904,33 @@ void udp_destroy_sock(struct sock *sk) } } +typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM + udp_gro_receive_t new_gro_receive; + if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { - if (family == AF_INET) - WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); - else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) - WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); + if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) + new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; + else + new_gro_receive = xfrm4_gro_udp_encap_rcv; + + if (udp_sk(sk)->gro_receive != new_gro_receive) { + /* + * With IPV6_ADDRFORM the gro callback could change + * after being set, unregister the old one, if valid. + */ + if (udp_sk(sk)->gro_receive) + udp_tunnel_update_gro_rcv(sk, false); + + WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); + udp_tunnel_update_gro_rcv(sk, true); + } } #endif } @@ -2962,6 +2980,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: + sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM @@ -2985,6 +3004,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } + sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: @@ -3002,13 +3022,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - + sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); + sockopt_release_sock(sk); break; /* diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 3d1214e6df0e..2326548997d3 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,6 +90,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_tunnel_encap_enable(sk); + udp_tunnel_update_gro_rcv(sk, true); + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && sk->sk_kern_sock) udp_tunnel_update_gro_lookup(net, sk, true); -- cgit v1.2.3 From 097f171f98289cf737437599c40b0d1e81266e9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:42:46 -0700 Subject: net: convert dev->rtnl_link_state to a bool netdevice reg_state was split into two 16 bit enums back in 2010 in commit a2835763e130 ("rtnetlink: handle rtnl_link netlink notifications manually"). Since the split the fields have been moved apart, and last year we converted reg_state to a normal u8 in commit 4d42b37def70 ("net: convert dev->reg_state to u8"). rtnl_link_state being a 16 bitfield makes no sense. Convert it to a single bool, it seems very unlikely after 15 years that we'll need more values in it. We could drop dev->rtnl_link_ops from the conditions but feels like having it there more clearly points at the reason for this hack. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250410014246.780885-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 10 ++-------- net/core/dev.c | 6 ++---- net/core/rtnetlink.c | 15 ++++++++------- 4 files changed, 13 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 6327e689e8a8..ca8605eb82ff 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -131,7 +131,7 @@ struct ref_tracker_dir refcnt_tracker struct list_head link_watch_list enum:8 reg_state bool dismantle -enum:16 rtnl_link_state +bool rtnl_link_initilizing bool needs_free_netdev void*priv_destructor struct net_device struct netpoll_info* npinfo read_mostly napi_poll/napi_poll_lock diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..e6036b82ef4c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1946,9 +1946,6 @@ enum netdev_reg_state { * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed - * @rtnl_link_state: This enum represents the phases of creating - * a new link - * * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one @@ -2363,11 +2360,8 @@ struct net_device { /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; - - enum { - RTNL_LINK_INITIALIZED, - RTNL_LINK_INITIALIZING, - } rtnl_link_state:16; + /** @rtnl_link_initializing: Device being created, suppress events */ + bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index d0563ddff6ca..03d20a98f8b7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11110,8 +11110,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -12025,8 +12024,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 39a5b72e861f..38526210b8fd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3580,7 +3580,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3591,12 +3591,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3654,7 +3655,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); -- cgit v1.2.3 From e846fb5e7c5243c65ff67247cb29a9d76bbcc4e8 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:16 -0400 Subject: net: bridge: mcast: Add offload failed mdb flag Add MDB_FLAGS_OFFLOAD_FAILED and MDB_PG_FLAGS_OFFLOAD_FAILED to indicate that an attempt to offload the MDB entry to switchdev has failed. Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-2-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 9 +++++---- net/bridge/br_mdb.c | 2 ++ net/bridge/br_private.h | 20 +++++++++++++++----- net/bridge/br_switchdev.c | 9 +++++---- 4 files changed, 27 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a5b743a2f775..f2a6de424f3f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -699,10 +699,11 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; -#define MDB_FLAGS_OFFLOAD (1 << 0) -#define MDB_FLAGS_FAST_LEAVE (1 << 1) -#define MDB_FLAGS_STAR_EXCL (1 << 2) -#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD_FAILED (1 << 4) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 722203b98ff7..aac96bf4ba44 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; + if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) + e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d5b3c5936a79..d9b8ed316040 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc { u16 vlan_id; }; -#define MDB_PG_FLAGS_PERMANENT BIT(0) -#define MDB_PG_FLAGS_OFFLOAD BIT(1) -#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) -#define MDB_PG_FLAGS_STAR_EXCL BIT(3) -#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5) #define PG_SRC_ENT_LIMIT 32 @@ -1342,6 +1343,15 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } + +static inline void +br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, + bool offloaded) +{ + p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); + p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD : + MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7b41ee8740cb..2d769ef3cb8a 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -505,8 +505,8 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; - if (err) - goto err; + if (err == -EOPNOTSUPP) + goto out_free; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); @@ -516,11 +516,12 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri pp = &p->next) { if (p->key.port != port) continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; + + br_multicast_set_pg_offload_flags(p, !err); } out: spin_unlock_bh(&br->multicast_lock); -err: +out_free: kfree(priv); } -- cgit v1.2.3 From 9fbe1e3e61c21508861a72324087aeeea85f796f Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:17 -0400 Subject: net: bridge: Add offload_fail_notification bopt Add BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION bool option. Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-3-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br.c | 5 +++++ net/bridge/br_private.h | 1 + 3 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f2a6de424f3f..73876c0e2bba 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -831,6 +831,7 @@ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, + BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index 183fcb362f9e..25dda554ca5b 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -284,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -302,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d9b8ed316040..e17a3c5fe689 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -484,6 +484,7 @@ enum net_bridge_opts { BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, + BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, }; struct net_bridge { -- cgit v1.2.3 From cd3c93167da0e760b5819246eae7a4ea30fd014b Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Apr 2025 12:41:36 +0200 Subject: page_pool: Move pp_magic check into helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we are about to stash some more information into the pp_magic field, let's move the magic signature checks into a pair of helper functions so it can be changed in one place. Reviewed-by: Mina Almasry Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 ++-- include/linux/mm.h | 20 ++++++++++++++++++++ mm/page_alloc.c | 8 ++------ net/core/netmem_priv.h | 5 +++++ net/core/skbuff.c | 16 ++-------------- net/core/xdp.c | 4 ++-- 6 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f803e1c93590..5ce1b463b7a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,8 +707,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as we know this is a page_pool page. + /* No need to check page_pool_page_is_pp() as we + * know this is a page_pool page. */ page_pool_recycle_direct(page->pp, page); } while (++n < num); diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954..56c47f4a38ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,4 +4248,24 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page. + * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling + * the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~0x3UL + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd6b865cb1ab..a18340b32218 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -897,9 +897,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -926,10 +924,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e00..f33162fd281c 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -18,6 +18,11 @@ static inline void netmem_clear_pp_magic(netmem_ref netmem) __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cbf77bc61fc..74a2d886a35b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -893,11 +893,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -995,14 +990,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1042,7 +1030,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); diff --git a/net/core/xdp.c b/net/core/xdp.c index 3cd0db9c9d2d..a014df88620c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -438,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); -- cgit v1.2.3 From ee62ce7a1d909ccba0399680a03c2dee83bcae95 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 9 Apr 2025 12:41:37 +0200 Subject: page_pool: Track DMA-mapped pages and unmap them when destroying the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling DMA mapping in page_pool, pages are kept DMA mapped until they are released from the pool, to avoid the overhead of re-mapping the pages every time they are used. This causes resource leaks and/or crashes when there are pages still outstanding while the device is torn down, because page_pool will attempt an unmap through a non-existent DMA device on the subsequent page return. To fix this, implement a simple tracking of outstanding DMA-mapped pages in page pool using an xarray. This was first suggested by Mina[0], and turns out to be fairly straight forward: We simply store pointers to pages directly in the xarray with xa_alloc() when they are first DMA mapped, and remove them from the array on unmap. Then, when a page pool is torn down, it can simply walk the xarray and unmap all pages still present there before returning, which also allows us to get rid of the get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional synchronisation is needed, as a page will only ever be unmapped once. To avoid having to walk the entire xarray on unmap to find the page reference, we stash the ID assigned by xa_alloc() into the page structure itself, using the upper bits of the pp_magic field. This requires a couple of defines to avoid conflicting with the POINTER_POISON_DELTA define, but this is all evaluated at compile-time, so does not affect run-time performance. The bitmap calculations in this patch gives the following number of bits for different architectures: - 23 bits on 32-bit architectures - 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE) - 32 bits on other 64-bit architectures Stashing a value into the unused bits of pp_magic does have the effect that it can make the value stored there lie outside the unmappable range (as governed by the mmap_min_addr sysctl), for architectures that don't define ILLEGAL_POINTER_VALUE. This means that if one of the pointers that is aliased to the pp_magic field (such as page->lru.next) is dereferenced while the page is owned by page_pool, that could lead to a dereference into userspace, which is a security concern. The risk of this is mitigated by the fact that (a) we always clear pp_magic before releasing a page from page_pool, and (b) this would need a use-after-free bug for struct page, which can have many other risks since page->lru.next is used as a generic list pointer in multiple places in the kernel. As such, with this patch we take the position that this risk is negligible in practice. For more discussion, see[1]. Since all the tracking added in this patch is performed on DMA map/unmap, no additional code is needed in the fast path, meaning the performance overhead of this tracking is negligible there. A micro-benchmark shows that the total overhead of the tracking itself is about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]). Since this cost is only paid on DMA map and unmap, it seems like an acceptable cost to fix the late unmap issue. Further optimisation can narrow the cases where this cost is paid (for instance by eliding the tracking when DMA map/unmap is a no-op). The extra memory needed to track the pages is neatly encapsulated inside xarray, which uses the 'struct xa_node' structure to track items. This structure is 576 bytes long, with slots for 64 items, meaning that a full node occurs only 9 bytes of overhead per slot it tracks (in practice, it probably won't be this efficient, but in any case it should be an acceptable overhead). [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/ [1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com [2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com Reported-by: Yonglong Liu Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Suggested-by: Mina Almasry Reviewed-by: Mina Almasry Reviewed-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Qiuling Ren Tested-by: Yuying Ma Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 46 +++++++++++++++++++++--- include/linux/poison.h | 4 +++ include/net/page_pool/types.h | 6 ++++ net/core/netmem_priv.h | 28 ++++++++++++++- net/core/page_pool.c | 81 ++++++++++++++++++++++++++++++++++++------- 5 files changed, 147 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 56c47f4a38ca..130d3c9d2ee4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,13 +4248,51 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. - * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. */ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa8..8ca2235f78d5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 36eb57d73abc..431b593de709 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -221,6 +225,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index f33162fd281c..cd95394399b4 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,6 +15,8 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } @@ -33,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7745ad924ae2..2b7684865941 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); @@ -320,9 +319,7 @@ free_ptr_ring: static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1080,8 +1114,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. -- cgit v1.2.3 From ceaceaf79ea0fe337344fc5c1fb10a421a362410 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Apr 2025 23:04:19 +0100 Subject: net: ethtool: fix get_ts_stats() documentation Commit 0e9c127729be ("ethtool: add interface to read Tx hardware timestamping statistics") added documentation for timestamping statistics, but added the detailed explanation for this method to the get_ts_info() rather than get_ts_stats(). Move it to the correct entry. Cc: Rahul Rameshbabu Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3MTz-000Crx-IW@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8210ece94fa6..013d25858642 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info { * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to - * ethtool_op_get_ts_info(). Drivers must not zero statistics which they - * don't report. The stats structure is initialized to ETHTOOL_STAT_NOT_SET - * indicating driver does not report statistics. - * @get_ts_stats: Query the device hardware timestamping statistics. + * ethtool_op_get_ts_info(). + * @get_ts_stats: Query the device hardware timestamping statistics. Drivers + * must not zero statistics which they don't report. The stats structure + * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not + * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module -- cgit v1.2.3 From 7a60d91c690bf73c2c78e763efa29f294e217c3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:32 -0700 Subject: net: Add ->exit_rtnl() hook to struct pernet_operations. struct pernet_operations provides two batching hooks; ->exit_batch() and ->exit_batch_rtnl(). The batching variant is beneficial if ->exit() meets any of the following conditions: 1) ->exit() repeatedly acquires a global lock for each netns 2) ->exit() has a time-consuming operation that can be factored out (e.g. synchronize_rcu(), smp_mb(), etc) 3) ->exit() does not need to repeat the same iterations for each netns (e.g. inet_twsk_purge()) Currently, none of the ->exit_batch_rtnl() functions satisfy any of the above conditions because RTNL is factored out and held by the caller and all of these functions iterate over the dying netns list. Also, we want to hold per-netns RTNL there but avoid spreading __rtnl_net_lock() across multiple locations. Let's add ->exit_rtnl() hook and run it under __rtnl_net_lock(). The following patches will convert all ->exit_batch_rtnl() users to ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 ++ net/core/net_namespace.c | 53 +++++++++++++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bd57d8fb54f1..b071e6eed9d5 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -475,6 +475,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); void (*exit_batch_rtnl)(struct list_head *net_exit_list, struct list_head *dev_kill_list); unsigned int * const id; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 37026776ae4e..afaa3d1bda8d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -163,16 +163,51 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -213,18 +248,8 @@ static void ops_undo_list(const struct list_head *ops_list, else synchronize_rcu(); - if (hold_rtnl) { - LIST_HEAD(dev_kill_list); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, ops_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, list) @@ -238,7 +263,7 @@ static void ops_undo_list(const struct list_head *ops_list, static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { - bool hold_rtnl = !!ops->exit_batch_rtnl; + bool hold_rtnl = ops->exit_rtnl || ops->exit_batch_rtnl; LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); -- cgit v1.2.3 From a967e01e2ad201f6ddc778ed65a5dae1c68ee8a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:35 -0700 Subject: ipv4: ip_tunnel: Convert ip_tunnel_delete_nets() callers to ->exit_rtnl(). ip_tunnel_delete_nets() iterates the dying netns list and performs the same operations for each. Let's export ip_tunnel_destroy() as ip_tunnel_delete_net() and call it from ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 7 +++---- net/ipv4/ip_gre.c | 27 ++++++++++++--------------- net/ipv4/ip_tunnel.c | 25 +++++++------------------ net/ipv4/ip_vti.c | 9 ++++----- net/ipv4/ipip.c | 9 ++++----- 5 files changed, 30 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a36a335cef9f..0c3d571a04a1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 26d15f907551..f5b9004d6938 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1066,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch_rtnl = ipgre_exit_batch_rtnl, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1752,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, - dev_to_kill); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1772,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch_rtnl = erspan_exit_batch_rtnl, + .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1024f961ec9a..3913ec89ad20 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1174,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, - struct list_head *head, - struct rtnl_link_ops *ops) +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *head) { + struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; + ASSERT_RTNL_NET(net); + for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); @@ -1198,21 +1201,7 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, unregister_netdevice_queue(t->dev, head); } } - -void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill) -{ - struct ip_tunnel_net *itn; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, dev_to_kill, ops); - } -} -EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 159b4473290e..686e4f3d83aa 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit vti_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch_rtnl = vti_exit_batch_rtnl, + .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bab0bf90c908..3e03af073a1c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -604,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipip_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch_rtnl = ipip_exit_batch_rtnl, + .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; -- cgit v1.2.3 From c57a9c503543cd8829eeaaf88362199e0491c0d7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:43 -0700 Subject: net: Remove ->exit_batch_rtnl(). There are no ->exit_batch_rtnl() users remaining. Let's remove the hook. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 -- net/core/net_namespace.c | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b071e6eed9d5..025a7574b275 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -477,8 +477,6 @@ struct pernet_operations { /* Following method is called with RTNL held. */ void (*exit_rtnl)(struct net *net, struct list_head *dev_kill_list); - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index afaa3d1bda8d..0a2b24af4028 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -185,12 +185,6 @@ static void ops_exit_rtnl_list(const struct list_head *ops_list, __rtnl_net_unlock(net); } - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, ops_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); @@ -263,7 +257,7 @@ static void ops_undo_list(const struct list_head *ops_list, static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { - bool hold_rtnl = ops->exit_rtnl || ops->exit_batch_rtnl; + bool hold_rtnl = !!ops->exit_rtnl; LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); -- cgit v1.2.3 From 651f88cb046c5e002f7c11de2cebf207787d2346 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:45 +0100 Subject: net: stmmac: remove eee_usecs_rate plat_dat->eee_users_rate is now unused, so remove this member. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuv-000E7y-9k@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c4ec8bb8144e..8aed09d65b4a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -276,7 +276,6 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; -- cgit v1.2.3 From 23738cc8048322cf324f330cd697380fb3455da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:47 +0100 Subject: rxrpc: Pull out certain app callback funcs into an ops table A number of functions separately furnish an AF_RXRPC socket with callback function pointers into a kernel app (such as the AFS filesystem) that is using it. Replace most of these with an ops table for the entire socket. This makes it easier to add more callback functions. Note that the call incoming data processing callback is retaind as that gets set to different things, depending on the type of op. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/rxrpc.c | 11 ++++++++--- include/net/af_rxrpc.h | 25 +++++++++++++++---------- net/rxrpc/af_rxrpc.c | 20 ++++++++------------ net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_accept.c | 34 ++++++++++++++++------------------ net/rxrpc/rxperf.c | 10 +++++++--- 6 files changed, 55 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5e480a33859..a4734304e542 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,15 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -84,8 +91,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -738,7 +744,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cf793d18e5df..ebb6092c488b 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -29,18 +29,23 @@ enum rxrpc_interruptibility { */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -72,9 +77,9 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2841ce72d7b8..2e4f6c9ef3de 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -477,24 +477,20 @@ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) EXPORT_SYMBOL(rxrpc_kernel_get_epoch); /** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. */ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /** * rxrpc_kernel_set_max_life - Set maximum lifespan on a call diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3cc3af15086f..55810c6b4be8 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -146,8 +146,7 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e685034ce4f7..a4b363b47cca 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -219,9 +219,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -387,8 +388,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -440,8 +442,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +450,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call - * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may be come connected before this function returns. */ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +471,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index e848a4777b8c..c76fbccfbb91 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -136,6 +136,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. */ @@ -161,7 +167,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +214,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) -- cgit v1.2.3 From 5800b1cf3fd8ccab752a101865be1e76dac33142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:49 +0100 Subject: rxrpc: Allow CHALLENGEs to the passed to the app for a RESPONSE Allow the app to request that CHALLENGEs be passed to it through an out-of-band queue that allows recvmsg() to pick it up so that the app can add data to it with sendmsg(). This will allow the application (AFS or userspace) to interact with the process if it wants to and put values into user-defined fields. This will be used by AFS when talking to a fileserver to supply that fileserver with a crypto key by which callback RPCs can be encrypted (ie. notifications from the fileserver to the client). Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 2 + fs/afs/Makefile | 1 + fs/afs/cm_security.c | 73 +++++++ fs/afs/internal.h | 6 + fs/afs/main.c | 1 + fs/afs/rxrpc.c | 18 ++ include/net/af_rxrpc.h | 24 +++ include/trace/events/rxrpc.h | 18 +- include/uapi/linux/rxrpc.h | 46 +++-- net/rxrpc/Makefile | 1 + net/rxrpc/af_rxrpc.c | 52 ++++- net/rxrpc/ar-internal.h | 51 ++++- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 134 ++++++++++++- net/rxrpc/conn_object.c | 1 + net/rxrpc/insecure.c | 13 +- net/rxrpc/io_thread.c | 12 +- net/rxrpc/oob.c | 379 +++++++++++++++++++++++++++++++++++++ net/rxrpc/output.c | 56 ++++++ net/rxrpc/recvmsg.c | 120 ++++++++++-- net/rxrpc/rxkad.c | 288 +++++++++++++++++----------- net/rxrpc/sendmsg.c | 15 +- net/rxrpc/server_key.c | 40 ++++ 23 files changed, 1188 insertions(+), 165 deletions(-) create mode 100644 fs/afs/cm_security.c create mode 100644 net/rxrpc/oob.c (limited to 'include') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 2680abd1cb26..eb941b2577dd 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1179,7 +1179,9 @@ API Function Reference .. kernel-doc:: net/rxrpc/af_rxrpc.c .. kernel-doc:: net/rxrpc/key.c +.. kernel-doc:: net/rxrpc/oob.c .. kernel-doc:: net/rxrpc/peer_object.c .. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/rxkad.c .. kernel-doc:: net/rxrpc/sendmsg.c .. kernel-doc:: net/rxrpc/server_key.c diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 5efd7e13b304..b49b8fe682f3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-y := \ addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 000000000000..efe6a23c8477 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include "internal.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 440b0e731093..b3612b700c6a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -281,6 +281,7 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; @@ -1058,6 +1059,11 @@ extern void __net_exit afs_cell_purge(struct afs_net *); */ extern bool afs_cm_incoming_call(struct afs_call *); +/* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); + /* * dir.c */ diff --git a/fs/afs/main.c b/fs/afs/main.c index c845c5daaeba..02475d415d88 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a4734304e542..212af2aa85bf 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,12 +25,14 @@ static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { .notify_new_call = afs_rx_new_call, .discard_new_call = afs_rx_discard_new_call, .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, }; /* asynchronous incoming call initial processing */ @@ -56,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -71,6 +74,10 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -131,6 +138,7 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); _debug("dework"); @@ -957,3 +965,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ebb6092c488b..0b209f703ffc 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,6 +25,10 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ @@ -37,6 +42,7 @@ struct rxrpc_kernel_ops { unsigned long user_call_ID); void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); }; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, @@ -88,5 +94,23 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cad50d91077e..08ecebd90595 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -25,6 +25,7 @@ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ + EM(afs_abort_unsupported_sec_class, "afs-unsup-sec-class") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ @@ -77,6 +78,7 @@ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ + EM(rxrpc_abort_response_sendmsg, "resp-sendmsg") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ @@ -133,24 +135,33 @@ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_post_oob, "GET post-oob ") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_response_rxgk, "NEW resp-rxgk") \ + EM(rxrpc_skb_new_response_rxkad, "NEW resp-rxkd") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ + EM(rxrpc_skb_put_challenge, "PUT challenge") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_oob, "PUT oob ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ + EM(rxrpc_skb_put_response, "PUT response ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_oob_challenge, "SEE oob-chall") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") @@ -216,9 +227,11 @@ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_challenge_input, "GET inp-chal") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_response, "GET response") \ EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ @@ -226,10 +239,12 @@ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_challenge_input, "PUT inp-chal") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_oob, "PUT oob ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ @@ -331,6 +346,7 @@ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_oobq, "OOBQ") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ @@ -456,7 +472,7 @@ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ - EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ + EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 8f8dc7a937a4..c4e9833b0a12 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -36,26 +36,33 @@ struct sockaddr_rxrpc { #define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ #define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ #define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ +#define RXRPC_MANAGE_RESPONSE 7 /* [clnt] Want to manage RESPONSE packets */ /* * RxRPC control messages * - If neither abort or accept are specified, the message is a data message. * - terminal messages mean that a user call ID tag can be recycled + * - C/S/- indicate whether these are applicable to client, server or both * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() */ enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC_SET_CALL_TIMEOUT = 13, /* s-: Set one or more call timeouts */ - RXRPC_CHARGE_ACCEPT = 14, /* s-: Charge the accept pool with a user call ID */ + RXRPC_USER_CALL_ID = 1, /* -sr: User call ID specifier */ + RXRPC_ABORT = 2, /* -sr: Abort request / notification [terminal] */ + RXRPC_ACK = 3, /* S-r: RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* --r: Network error received [terminal] */ + RXRPC_BUSY = 6, /* C-r: Server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* --r: Local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* S-r: New incoming call notification */ + RXRPC_EXCLUSIVE_CALL = 10, /* Cs-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* Cs-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* -s-: Total length of Tx data */ + RXRPC_SET_CALL_TIMEOUT = 13, /* -s-: Set one or more call timeouts */ + RXRPC_CHARGE_ACCEPT = 14, /* Ss-: Charge the accept pool with a user call ID */ + RXRPC_OOB_ID = 15, /* -sr: OOB message ID */ + RXRPC_CHALLENGED = 16, /* C-r: Info on a received CHALLENGE */ + RXRPC_RESPOND = 17, /* Cs-: Respond to a challenge */ + RXRPC_RESPONDED = 18, /* S-r: Data received in RESPONSE */ + RXRPC_RESP_RXGK_APPDATA = 19, /* Cs-: RESPONSE: RxGK app data to include */ RXRPC__SUPPORTED }; @@ -118,4 +125,19 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * Challenge information in the RXRPC_CHALLENGED control message. + */ +struct rxrpc_challenge { + __u16 service_id; /* The service ID of the connection (may be upgraded) */ + __u8 security_index; /* The security index of the connection */ + __u8 pad; /* Round out to a multiple of 4 bytes. */ + /* ... The security class gets to append extra information ... */ +}; + +struct rxgk_challenge { + struct rxrpc_challenge base; + __u32 enctype; /* Krb5 encoding type */ +}; + #endif /* _UAPI_LINUX_RXRPC_H */ diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 210b75e3179e..0981941dea73 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -24,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2e4f6c9ef3de..3a558c1a541e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -633,7 +633,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -668,7 +671,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -749,6 +752,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -855,6 +878,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -888,8 +913,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -900,6 +927,23 @@ static int rxrpc_shutdown(struct socket *sock, int flags) return ret; } +/* + * Purge the out-of-band queue. + */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + /* * RxRPC socket destructor */ @@ -907,6 +951,7 @@ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -945,7 +990,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -957,6 +1004,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c267b8ac1bb5..31d05784c530 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -39,6 +39,7 @@ struct rxrpc_txqueue; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ @@ -149,6 +150,9 @@ struct rxrpc_sock { const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -159,6 +163,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -202,7 +207,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -216,6 +221,19 @@ struct rxrpc_skb_priv { u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -269,9 +287,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -526,6 +559,7 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; }; + struct sk_buff *tx_response; /* Response packet to be transmitted */ unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -1199,8 +1233,11 @@ void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { + if (!local->io_thread) + return; wake_up_process(READ_ONCE(local->io_thread)); } @@ -1289,6 +1326,13 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) return net_generic(net, rxrpc_net_id); } +/* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + /* * output.c */ @@ -1300,6 +1344,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f428ea77f803..fc88ffe1b050 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -145,8 +145,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 4d9c5e21ba78..232b6986da83 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,7 +19,7 @@ /* * Set the completion state on an aborted connection. */ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -67,7 +75,7 @@ static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { trace_rxrpc_rx_conn_abort(conn, skb); - rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, RXRPC_CALL_REMOTELY_ABORTED); } @@ -248,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -270,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ - sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -391,6 +403,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); } +/* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. + */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + /* * Input a connection-level packet. */ @@ -411,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -436,6 +513,19 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); + if (conn->tx_response) { + struct sk_buff *skb; + + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + if (skb) { switch (skb->mark) { case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: @@ -452,3 +542,31 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + + /* Always go with the response to the most recent challenge. */ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = old; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8ac22dde8b39..d14a802d2001 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -329,6 +329,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) } rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e068f9b79d02..1f7c136d6d0e 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -42,13 +42,19 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool none_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); } +static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { @@ -82,7 +88,8 @@ const struct rxrpc_security rxrpc_no_security = { .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 64f8d77b8731..27b650d30f4d 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -489,8 +489,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -501,9 +501,11 @@ int rxrpc_io_thread(void *data) } /* Deal with connections that want immediate attention. */ - spin_lock_irq(&local->lock); - list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); - spin_unlock_irq(&local->lock); + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + } while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 000000000000..3601022b9190 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. + * The tree is also given a ref to the message. + */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. + */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. + * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. + */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = 0; /* TODO: retrieve appdata */ + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. + */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = 0; /* TODO: retrieve appdata */ + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. + */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 95905b85a8d7..04c900990a40 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -916,3 +916,59 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } + +/* + * Send a RESPONSE message. + */ +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec bvec[16]; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; + + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; + + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); +} diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 4c622752a569..86a27fb55a1c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -154,6 +154,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) return call->security->verify_packet(call, skb); } +/* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. + */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. + */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA @@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +345,7 @@ out: call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +385,7 @@ try_again: /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +407,8 @@ try_again: if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,6 +418,15 @@ try_again: goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. * We also want to weed out calls that got requeued whilst we were @@ -342,7 +437,8 @@ try_again: call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); @@ -377,21 +473,9 @@ try_again: if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; - } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6cb37b0eb77f..8ddccee69009 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -697,62 +697,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return 0; } -/* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - /* * calculate the response checksum */ @@ -772,12 +716,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -786,88 +739,205 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. */ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; +} + +/* + * Insert the header into the response. + */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; + + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } +/* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + /* * decrypt the kerberos IV ticket in the response */ @@ -1276,6 +1346,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2342b5f1547c..ebbb78b842de 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -758,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index eb7525e92951..36b05fd842a7 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -171,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the keyring on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. + * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. + */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); -- cgit v1.2.3 From 01af64269751f261421a9e80a527c8e987aeda8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:50 +0100 Subject: rxrpc: Add the security index for yfs-rxgk Add the security index and abort codes for the YFS variant of rxgk. Signed-off-by: David Howells Link: https://patch.msgid.link/20250411095303.2316168-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/misc.c | 27 +++++++++++++++++++++++++++ include/crypto/krb5.h | 5 +++++ include/uapi/linux/rxrpc.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f..8f2b3a177690 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 62d998e62f47..71dd38f59be1 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -63,6 +63,11 @@ struct scatterlist; #define KEY_USAGE_SEED_ENCRYPTION (0xAA) #define KEY_USAGE_SEED_INTEGRITY (0x55) +/* + * Standard Kerberos error codes. + */ +#define KRB5_PROG_KEYTYPE_NOSUPP -1765328233 + /* * Mode of operation. */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index c4e9833b0a12..d9735abd4c79 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -80,6 +80,7 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +#define RXRPC_SECURITY_YFS_RXGK 6 /* YFS gssapi-based */ /* * RxRPC-level abort codes @@ -125,6 +126,36 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * RxGK GSSAPI security abort codes. + */ +#if 0 /* Original standard abort codes (used by OpenAFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Invalid security challenge */ +#define RXGK_BADETYPE 1233242883 /* Invalid or impermissible encryption type */ +#define RXGK_BADLEVEL 1233242884 /* Invalid or impermissible security level */ +#define RXGK_BADKEYNO 1233242885 /* Key version number not found */ +#define RXGK_EXPIRED 1233242886 /* Token has expired */ +#define RXGK_NOTAUTH 1233242887 /* Caller not authorized */ +#define RXGK_BAD_TOKEN 1233242888 /* Security object was passed a bad token */ +#define RXGK_SEALED_INCON 1233242889 /* Sealed data inconsistent */ +#define RXGK_DATA_LEN 1233242890 /* User data too long */ +#define RXGK_BAD_QOP 1233242891 /* Inadequate quality of protection available */ +#else /* Revised standard abort codes (used by YFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Security challenge/response failed */ +#define RXGK_SEALEDINCON 1233242883 /* Sealed data is inconsistent */ +#define RXGK_NOTAUTH 1233242884 /* Caller not authorised */ +#define RXGK_EXPIRED 1233242885 /* Authentication expired */ +#define RXGK_BADLEVEL 1233242886 /* Unsupported or not permitted security level */ +#define RXGK_BADKEYNO 1233242887 /* Bad transport key number */ +#define RXGK_NOTRXGK 1233242888 /* Security layer is not rxgk */ +#define RXGK_UNSUPPORTED 1233242889 /* Endpoint does not support rxgk */ +#define RXGK_GSSERROR 1233242890 /* GSSAPI mechanism error */ +#endif + /* * Challenge information in the RXRPC_CHALLENGED control message. */ -- cgit v1.2.3 From 0ca100ff4df64f5d0f6c1dd5080c3e096786bea6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:51 +0100 Subject: rxrpc: Add YFS RxGK (GSSAPI) security class Add support for the YFS-variant RxGK security class to support GSSAPI-derived authentication. This also allows the use of better crypto over the rxkad security class. The key payload is XDR encoded of the form: typedef int64_t opr_time; const AFSTOKEN_RK_TIX_MAX = 12000; /* Matches entry in rxkad.h */ struct token_rxkad { afs_int32 viceid; afs_int32 kvno; afs_int64 key; afs_int32 begintime; afs_int32 endtime; afs_int32 primary_flag; opaque ticket; }; struct token_rxgk { opr_time begintime; opr_time endtime; afs_int64 level; afs_int64 lifetime; afs_int64 bytelife; afs_int64 enctype; opaque key<>; opaque ticket<>; }; const AFSTOKEN_UNION_NOAUTH = 0; const AFSTOKEN_UNION_KAD = 2; const AFSTOKEN_UNION_YFSGK = 6; union ktc_tokenUnion switch (afs_int32 type) { case AFSTOKEN_UNION_KAD: token_rxkad kad; case AFSTOKEN_UNION_YFSGK: token_rxgk gk; }; const AFSTOKEN_LENGTH_MAX = 16384; typedef opaque token_opaque; const AFSTOKEN_MAX = 8; const AFSTOKEN_CELL_MAX = 64; struct ktc_setTokenData { afs_int32 flags; string cell; token_opaque tokens; }; The parser for the basic token struct is already present, as is the rxkad token type. This adds a parser for the rxgk token type. Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/keys/rxrpc-type.h | 17 +++++ net/rxrpc/key.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) (limited to 'include') diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h index 333c0f49a9cd..0ddbe197a261 100644 --- a/include/keys/rxrpc-type.h +++ b/include/keys/rxrpc-type.h @@ -9,6 +9,7 @@ #define _KEYS_RXRPC_TYPE_H #include +#include /* * key type for AF_RXRPC keys @@ -31,6 +32,21 @@ struct rxkad_key { u8 ticket[]; /* the encrypted ticket */ }; +/* + * RxRPC key for YFS-RxGK (type-6 security) + */ +struct rxgk_key { + s64 begintime; /* Time at which the ticket starts */ + s64 endtime; /* Time at which the ticket ends */ + u64 lifetime; /* Maximum lifespan of a connection (seconds) */ + u64 bytelife; /* Maximum number of bytes on a connection */ + unsigned int enctype; /* Encoding type */ + s8 level; /* Negotiated security RXRPC_SECURITY_PLAIN/AUTH/ENCRYPT */ + struct krb5_buffer key; /* Master key, K0 */ + struct krb5_buffer ticket; /* Ticket to be passed to server */ + u8 _key[]; /* Key storage */ +}; + /* * list of tokens attached to an rxrpc key */ @@ -40,6 +56,7 @@ struct rxrpc_key_token { struct rxrpc_key_token *next; /* the next token in the list */ union { struct rxkad_key *kad; + struct rxgk_key *rxgk; }; }; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 8c99cf19b19d..9fdc1f031c9d 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? -tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + _leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -597,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -676,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); -- cgit v1.2.3 From 9d1d2b59341f58126a69b51f9f5f8ccb9f12e54a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:53 +0100 Subject: rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI) Implement the basic parts of the yfs-rxgk security class (security index 6) to support GSSAPI-negotiated security. Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 1 + fs/afs/cm_security.c | 12 + include/trace/events/rxrpc.h | 45 +- net/rxrpc/Makefile | 2 + net/rxrpc/ar-internal.h | 17 + net/rxrpc/output.c | 2 +- net/rxrpc/protocol.h | 20 + net/rxrpc/rxgk.c | 1215 ++++++++++++++++++++++++++++++++++++ net/rxrpc/rxgk_app.c | 285 +++++++++ net/rxrpc/rxgk_common.h | 91 +++ net/rxrpc/rxkad.c | 6 +- net/rxrpc/security.c | 3 + 12 files changed, 1694 insertions(+), 5 deletions(-) create mode 100644 net/rxrpc/rxgk.c create mode 100644 net/rxrpc/rxgk_app.c (limited to 'include') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index eb941b2577dd..a01f0c81ca4b 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1182,6 +1182,7 @@ API Function Reference .. kernel-doc:: net/rxrpc/oob.c .. kernel-doc:: net/rxrpc/peer_object.c .. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/rxgk.c .. kernel-doc:: net/rxrpc/rxkad.c .. kernel-doc:: net/rxrpc/sendmsg.c .. kernel-doc:: net/rxrpc/server_key.c diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c index efe6a23c8477..e8eb63e4d124 100644 --- a/fs/afs/cm_security.c +++ b/fs/afs/cm_security.c @@ -6,6 +6,7 @@ */ #include +#include #include "internal.h" #include "afs_fs.h" #include "protocol_yfs.h" @@ -17,6 +18,9 @@ */ static int afs_respond_to_challenge(struct sk_buff *challenge) { +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; +#endif struct rxrpc_peer *peer; unsigned long peer_data; u16 service_id; @@ -44,8 +48,16 @@ static int afs_respond_to_challenge(struct sk_buff *challenge) } switch (security_index) { +#ifdef CONFIG_RXKAD case RXRPC_SECURITY_RXKAD: return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + case RXRPC_SECURITY_YFS_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif default: return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 08ecebd90595..aab81e8196ae 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -69,6 +69,38 @@ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ + /* RxGK security errors */ \ + EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ + EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ + EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ + EM(rxgk_abort_2_short_encdata, "rxgk2-short-encdata") \ + EM(rxgk_abort_2_short_header, "rxgk2-short-hdr") \ + EM(rxgk_abort_bad_key_number, "rxgk-bad-key-num") \ + EM(rxgk_abort_chall_key_expired, "rxgk-chall-key-exp") \ + EM(rxgk_abort_chall_no_key, "rxgk-chall-nokey") \ + EM(rxgk_abort_chall_short, "rxgk-chall-short") \ + EM(rxgk_abort_resp_auth_dec, "rxgk-resp-auth-dec") \ + EM(rxgk_abort_resp_bad_callid, "rxgk-resp-bad-callid") \ + EM(rxgk_abort_resp_bad_nonce, "rxgk-resp-bad-nonce") \ + EM(rxgk_abort_resp_bad_param, "rxgk-resp-bad-param") \ + EM(rxgk_abort_resp_call_ctr, "rxgk-resp-call-ctr") \ + EM(rxgk_abort_resp_call_state, "rxgk-resp-call-state") \ + EM(rxgk_abort_resp_internal_error, "rxgk-resp-int-error") \ + EM(rxgk_abort_resp_nopkg, "rxgk-resp-nopkg") \ + EM(rxgk_abort_resp_short_applen, "rxgk-resp-short-applen") \ + EM(rxgk_abort_resp_short_auth, "rxgk-resp-short-auth") \ + EM(rxgk_abort_resp_short_call_list, "rxgk-resp-short-callls") \ + EM(rxgk_abort_resp_short_packet, "rxgk-resp-short-packet") \ + EM(rxgk_abort_resp_short_yfs_klen, "rxgk-resp-short-yfs-klen") \ + EM(rxgk_abort_resp_short_yfs_key, "rxgk-resp-short-yfs-key") \ + EM(rxgk_abort_resp_short_yfs_tkt, "rxgk-resp-short-yfs-tkt") \ + EM(rxgk_abort_resp_tok_dec, "rxgk-resp-tok-dec") \ + EM(rxgk_abort_resp_tok_internal_error, "rxgk-resp-tok-int-err") \ + EM(rxgk_abort_resp_tok_keyerr, "rxgk-resp-tok-keyerr") \ + EM(rxgk_abort_resp_tok_nokey, "rxgk-resp-tok-nokey") \ + EM(rxgk_abort_resp_tok_nopkg, "rxgk-resp-tok-nopkg") \ + EM(rxgk_abort_resp_tok_short, "rxgk-resp-tok-short") \ + EM(rxgk_abort_resp_xdr_align, "rxgk-resp-xdr-align") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ @@ -471,6 +503,7 @@ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ + EM(rxrpc_tx_point_rxgk_challenge, "RxGKChall") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ @@ -489,6 +522,7 @@ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_alloc_response, "ALLOC RESP ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ @@ -496,6 +530,7 @@ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_response_tx, "PUT RESP TX") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ @@ -1178,6 +1213,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u8, security_ix) ), TP_fast_assign( @@ -1186,11 +1222,13 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) @@ -1208,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_response, __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) + __field(u8, security_ix) ), TP_fast_assign( @@ -1216,11 +1255,13 @@ TRACE_EVENT(rxrpc_rx_response, __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 3eda77a0266b..c0542bae719e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -41,6 +41,8 @@ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a776f08d10b3..57ea6d61b7a2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -31,6 +31,7 @@ struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -312,6 +313,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -559,7 +565,10 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; struct { + struct rxgk_context *keys[1]; u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ } rxgk; }; struct sk_buff *tx_response; /* Response packet to be transmitted */ @@ -903,6 +912,8 @@ struct rxrpc_txbuf { unsigned short len; /* Amount of data in buffer */ unsigned short space; /* Remaining data space */ unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ unsigned short pkt_len; /* Size of packet content */ unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; @@ -1339,6 +1350,7 @@ int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); /* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); @@ -1411,6 +1423,11 @@ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); void rxrpc_call_init_rtt(struct rxrpc_call *call); +/* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; + /* * rxkad.c */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 04c900990a40..8138f35d7945 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 42f70e4636f8..f8bfec12bc7e 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -181,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header. + */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 000000000000..02752eeb2395 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + u16 *specific_key_number) +{ + refcount_inc(&conn->rxgk.keys[0]->usage); + return conn->rxgk.keys[0]; +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, 0, GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[0] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. + */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. */ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. */ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) + return ret; + + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + return -ENOMEM; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). + */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). + */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. + */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. + */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. + */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + return ret; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. + */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. */ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 000000000000..6206a84395b8 --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. + * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data; + * opaque display; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. */ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. + */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index da1464e65766..7370a5655985 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -33,6 +33,19 @@ struct rxgk_context { struct crypto_aead *resp_enc; /* Response packet enc key */ }; +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + /* * rxgk_kdf.c */ @@ -46,3 +59,81 @@ int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, unsigned int enctype, const struct krb5_enctype **_krb5, gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. + */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8ddccee69009..0b5e007c7de9 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -177,8 +177,10 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem if (!txb) return NULL; - txb->offset += shdr; - txb->space = part; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9784adc8f275..078d91a6b77f 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) -- cgit v1.2.3 From 7a7513a3081c6a2729d8570c77bbed1978277dc9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:54 +0100 Subject: rxrpc: rxgk: Implement connection rekeying Implement rekeying of connections with the RxGK security class. This involves regenerating the keys with a different key number as part of the input data after a certain amount of time or a certain amount of bytes encrypted. Rekeying may be triggered by either end. The LSW of the key number is inserted into the security-specific field in the RX header, and we try and expand it to 32-bits to make it last longer. Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 24 +++++++ net/rxrpc/ar-internal.h | 5 +- net/rxrpc/conn_object.c | 1 + net/rxrpc/rxgk.c | 158 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 181 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index aab81e8196ae..920439df1f6f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -2725,6 +2725,30 @@ TRACE_EVENT(rxrpc_rack_timer, ktime_to_us(__entry->delay)) ); +TRACE_EVENT(rxrpc_rxgk_rekey, + TP_PROTO(struct rxrpc_connection *conn, + unsigned int current_key, unsigned int requested_key), + + TP_ARGS(conn, current_key, requested_key), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(unsigned int, current_key) + __field(unsigned int, requested_key) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->current_key = current_key; + __entry->requested_key = requested_key; + ), + + TP_printk("C=%08x cur=%x req=%x", + __entry->conn, + __entry->current_key, + __entry->requested_key) + ); + #undef EM #undef E_ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 57ea6d61b7a2..fd235bfa226d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -565,13 +565,16 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; struct { - struct rxgk_context *keys[1]; + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ u64 start_time; /* The start time for TK derivation */ u8 nonce[20]; /* Response re-use preventer */ u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index d14a802d2001..37340becb224 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -73,6 +73,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 02752eeb2395..8b1ccdf8bc58 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -76,11 +76,155 @@ static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) seq_printf(m, ": %s", krb5->name); } +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? *specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? *specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, - u16 *specific_key_number) + const u16 *specific_key_number) { - refcount_inc(&conn->rxgk.keys[0]->usage); - return conn->rxgk.keys[0]; + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. + */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); } /* @@ -92,7 +236,8 @@ static int rxgk_init_connection_security(struct rxrpc_connection *conn, struct rxgk_context *gk; int ret; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); conn->security_ix = token->security_index; conn->security_level = token->rxgk->level; @@ -102,11 +247,12 @@ static int rxgk_init_connection_security(struct rxrpc_connection *conn, do_div(conn->rxgk.start_time, 100); } - gk = rxgk_generate_transport_key(conn, token->rxgk, 0, GFP_NOFS); + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); if (IS_ERR(gk)) return PTR_ERR(gk); conn->rxgk.enctype = gk->krb5->etype; - conn->rxgk.keys[0] = gk; + conn->rxgk.keys[gk->key_number & 3] = gk; switch (conn->security_level) { case RXRPC_SECURITY_PLAIN: -- cgit v1.2.3 From d03539d5c2dec9b028297c15e57bd3c01d0d9c0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:56 +0100 Subject: rxrpc: Display security params in the afs_cb_call tracepoint Make the afs_cb_call tracepoint display some security parameters to make debugging easier. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 1 + fs/afs/internal.h | 2 ++ fs/afs/rxrpc.c | 4 ++++ include/net/af_rxrpc.h | 2 ++ include/trace/events/afs.h | 11 +++++++++-- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_object.c | 20 ++++++++++++++++++++ net/rxrpc/rxgk.c | 2 ++ 8 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index a01f0c81ca4b..fe2ea73be441 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1178,6 +1178,7 @@ API Function Reference ====================== .. kernel-doc:: net/rxrpc/af_rxrpc.c +.. kernel-doc:: net/rxrpc/call_object.c .. kernel-doc:: net/rxrpc/key.c .. kernel-doc:: net/rxrpc/oob.c .. kernel-doc:: net/rxrpc/peer_object.c diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b3612b700c6a..178804817efb 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -176,8 +176,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 212af2aa85bf..00b3bc087f61 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -813,6 +813,10 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0b209f703ffc..f15341594cc8 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -112,5 +112,7 @@ int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 8857f5ea77d4..7f83d242c8e9 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -663,19 +663,26 @@ TRACE_EVENT(afs_cb_call, __field(unsigned int, call) __field(u32, op) __field(u16, service_id) + __field(u8, security_ix) + __field(u32, enctype) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; __entry->service_id = call->service_id; + __entry->security_ix = call->security_ix; + __entry->enctype = call->enctype; ), - TP_printk("c=%08x %s", + TP_printk("c=%08x %s sv=%u sx=%u en=%u", __entry->call, __entry->service_id == 2501 ? __print_symbolic(__entry->op, yfs_cm_operations) : - __print_symbolic(__entry->op, afs_cm_operations)) + __print_symbolic(__entry->op, afs_cm_operations), + __entry->service_id, + __entry->security_ix, + __entry->enctype) ); TRACE_EVENT(afs_call, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd235bfa226d..ca62a1db3286 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -740,6 +740,7 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fc88ffe1b050..e9e8f0ef3fd5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -760,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. + */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 8b1ccdf8bc58..6175fc54ba90 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -443,6 +443,7 @@ static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) if (ret < 0) return ret; + call->security_enctype = gk->krb5->etype; txb->cksum = htons(gk->key_number); switch (call->conn->security_level) { @@ -590,6 +591,7 @@ static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) } } + call->security_enctype = gk->krb5->etype; switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: return 0; -- cgit v1.2.3 From fba6995798c6085a0c2fc67e0cacd489a6971044 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:58 +0100 Subject: rxrpc: Add more CHALLENGE/RESPONSE packet tracing Add more tracing for CHALLENGE and RESPONSE packets. Currently, rxrpc only has client-relevant tracepoints (rx_challenge and tx_response), but add the server-side ones too. Further, record the service ID in the rx_challenge tracepoint as well. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 78 +++++++++++++++++++++++++++++++++++++++++++- net/rxrpc/output.c | 2 ++ net/rxrpc/rxgk.c | 4 +++ net/rxrpc/rxkad.c | 2 ++ 4 files changed, 85 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 920439df1f6f..378d2dfc7392 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1201,6 +1201,39 @@ TRACE_EVENT(rxrpc_rx_conn_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_tx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce), + + TP_ARGS(conn, serial, version, nonce), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, version) + __field(u32, nonce) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x", + __entry->conn, + __entry->serial, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->nonce) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), @@ -1213,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u16, service_id) __field(u8, security_ix) ), @@ -1222,18 +1256,60 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->service_id, __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) ); +TRACE_EVENT(rxrpc_tx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + struct rxrpc_skb_priv *rsp), + + TP_ARGS(conn, serial, rsp), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(rxrpc_serial_t, challenge) + __field(u32, version) + __field(u32, kvno) + __field(u16, ticket_len) + __field(u16, appdata_len) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->challenge = rsp->resp.challenge_serial; + __entry->version = rsp->resp.version; + __entry->kvno = rsp->resp.kvno; + __entry->ticket_len = rsp->resp.ticket_len; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u", + __entry->conn, + __entry->serial, + __entry->challenge, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8138f35d7945..0af19bcdc80a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -953,6 +953,8 @@ void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response serial = rxrpc_get_next_serials(conn, 1); wserial = htonl(serial); + trace_rxrpc_tx_response(conn, serial, sp); + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), &wserial, sizeof(wserial)); if (ret < 0) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 6175fc54ba90..ba8bc201b8d3 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -668,6 +668,8 @@ static int rxgk_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serials(conn, 1); whdr->serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); if (ret > 0) conn->peer->last_tx_at = ktime_get_seconds(); @@ -1203,6 +1205,8 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, if (xdr_round_up(token_len) + sizeof(__be32) > len) goto short_packet; + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + offset += xdr_round_up(token_len); len -= xdr_round_up(token_len); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 0b5e007c7de9..3657c0661cdc 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -685,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, -- cgit v1.2.3 From 6e83166dd8003e8611f253426b85e0c3d933e1c2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:32 +0200 Subject: mptcp: sched: remove mptcp_sched_data This is a follow-up of commit b68b106b0f15 ("mptcp: sched: reduce size for unused data"), now removing the mptcp_sched_data structure. Now is a good time to do that, because the previously mentioned WIP work has been updated, no longer depending on this structure. Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-1-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 13 ++----------- net/mptcp/sched.c | 18 +++++++----------- 2 files changed, 9 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index bfbad695951c..f7263fe2a2e4 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -101,18 +101,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_send)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); - int (*get_retrans)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index c16c6fbd4ba2..f09f7eb1d63f 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,8 +16,7 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_send(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; @@ -29,8 +28,7 @@ static int mptcp_sched_default_get_send(struct mptcp_sock *msk, return 0; } -static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; @@ -157,7 +155,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -178,14 +175,13 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_send(msk, data); - return msk->sched->get_send(msk, data); + return mptcp_sched_default_get_send(msk); + return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -199,8 +195,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_retrans(msk, data); + return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk, data); - return msk->sched->get_send(msk, data); + return msk->sched->get_retrans(msk); + return msk->sched->get_send(msk); } -- cgit v1.2.3 From f99564688f38458d86b64f099ebf03f19517cf77 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 13 Apr 2025 16:09:40 +0200 Subject: net: phy: remove device_phy_find_device AFAICS this function has never had a user. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/ab7b8094-2eea-4e82-a047-fd60117f220b@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 12 ------------ include/linux/phy.h | 6 ------ 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc1bfd22fb81..cc6c209fe702 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3235,18 +3235,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) } EXPORT_SYMBOL(fwnode_phy_find_device); -/** - * device_phy_find_device - For the given device, get the phy_device - * @dev: Pointer to the given device - * - * Refer return conditions of fwnode_phy_find_device(). - */ -struct phy_device *device_phy_find_device(struct device *dev) -{ - return fwnode_phy_find_device(dev_fwnode(dev)); -} -EXPORT_SYMBOL_GPL(device_phy_find_device); - /** * fwnode_get_phy_node - Get the phy_node using the named reference. * @fwnode: Pointer to fwnode from which phy_node has to be obtained. diff --git a/include/linux/phy.h b/include/linux/phy.h index a2bfae80c449..fb755358d965 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); -struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); @@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) return NULL; } -static inline struct phy_device *device_phy_find_device(struct device *dev) -{ - return NULL; -} - static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { -- cgit v1.2.3 From 95d06e92a401928fe46fda7616e460f39cb7211b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:07 -0700 Subject: netlink: Introduce nlmsg_payload helper Create a new helper function, nlmsg_payload(), to simplify checking and retrieving Netlink message payloads. This reduces boilerplate code for users who need to verify the message length before accessing its data. Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Jakub Kicinski Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-1-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 29e0db940382..82e07e272290 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -611,6 +611,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) return nlh->nlmsg_len - NLMSG_HDRLEN; } +/** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header -- cgit v1.2.3 From 7c571ac57d9d97190dcba18212fabf99888b0c48 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:30 -0700 Subject: net: ptp: introduce .supported_extts_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_EXTTS_REQUEST(2) ioctl has a flags field which specifies how the external timestamp request should behave. This includes which edge of the signal to timestamp, as well as a specialized "offset" mode. It is expected that more flags will be added in the future. Driver authors routinely do not check the flags, often accepting requests with flags which they do not support. Even drivers which do check flags may not be future-proofed to reject flags not yet defined. Thus, any future flag additions often require manually updating drivers to reject these flags. This approach of hoping we catch flag checks during review, or playing whack-a-mole after the fact is the wrong approach. Introduce the "supported_extts_flags" field to the ptp_clock_info structure. This field defines the set of flags the device actually supports. Update the core character device logic to check this field and reject unsupported requests. Getting this right is somewhat tricky. First, to avoid unnecessary repetition and make basic functionality work when .supported_extts_flags is 0, the core always accepts the PTP_ENABLE_FEATURE flag. This flag is used to set the 'on' parameter to the .enable function and is thus always 'supported' by all drivers. For backwards compatibility, the PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely "hints" when using the old PTP_EXTTS_REQUEST ioctl, and are not expected to be enforced. If the user issues PTP_EXTTS_REQUEST2, the PTP_STRICT_FLAGS flag is added which is supposed to inform the driver to strictly validate the flags and reject unsupported requests. To handle this, first check if the driver reports PTP_STRICT_FLAGS support. If it does not, then always allow the PTP_RISING_EDGE and PTP_FALLING_EDGE flags. This keeps backwards compatibility with the original PTP_EXTTS_REQUEST ioctl where these flags are not guaranteed to be honored. This way, drivers which do not set the supported_extts_flags will continue to accept requests for the original PTP_EXTTS_REQUEST ioctl. The core will automatically reject requests with new flags, and correctly reject requests with PTP_STRICT_FLAGS, where the driver is supposed to strictly validate the flags. Update the various drivers, refactoring their validation logic into the .supported_extts_flags field. For consistency and readability, PTP_ENABLE_FEATURE is not set in the supported flags list, and PTP_EXTTS_EDGES is expanded to PTP_RISING_EDGE | PTP_FALLING_EDGE in all cases. Note the following driver files set n_ext_ts to a non-zero value but did not check flags at all: • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/freescale/enetc/enetc_ptp.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c • drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c • drivers/net/ethernet/renesas/ravb_ptp.c • drivers/net/ethernet/renesas/rtsn.c • drivers/net/ethernet/renesas/rtsn.h • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/ti/cpts.h • drivers/net/ethernet/ti/icssg/icss_iep.c • drivers/net/ethernet/xscale/ptp_ixp46x.c • drivers/net/phy/bcm-phy-ptp.c • drivers/ptp/ptp_ocp.c • drivers/ptp/ptp_pch.c • drivers/ptp/ptp_qoriq.c These drivers behavior does change slightly: they will now reject the PTP_EXTTS_REQUEST2 ioctl, because they do not strictly validate their flags. This also makes them no longer incorrectly accept PTP_EXT_OFFSET. Also note that the renesas ravb driver does not support PTP_STRICT_FLAGS. We could leave the .supported_extts_flags as 0, but I added the PTP_RISING_EDGE | PTP_FALLING_EDGE since the driver previously manually validated these flags. This is equivalent to 0 because the core will allow these flags regardless unless PTP_STRICT_FLAGS is also set. Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-1-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/ptp.c | 11 ++++------- drivers/net/dsa/sja1105/sja1105_ptp.c | 10 +++------- drivers/net/ethernet/intel/ice/ice_ptp.c | 12 ++++-------- drivers/net/ethernet/intel/igb/igb_ptp.c | 20 ++++++-------------- drivers/net/ethernet/intel/igc/igc_ptp.c | 10 +++------- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 11 ++++------- drivers/net/ethernet/microchip/lan743x_ptp.c | 9 +++------ drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c | 8 ++------ drivers/net/ethernet/renesas/ravb_ptp.c | 7 +------ drivers/net/phy/dp83640.c | 10 +++------- drivers/net/phy/micrel.c | 8 +++----- drivers/net/phy/nxp-c45-tja11xx.c | 9 +++------ drivers/ptp/ptp_chardev.c | 14 +++++++++++++- drivers/ptp/ptp_clockmatrix.c | 14 ++------------ drivers/ptp/ptp_fc3.c | 1 + drivers/ptp/ptp_idt82p33.c | 15 +++------------ include/linux/ptp_clock_kernel.h | 12 ++++++++++++ 17 files changed, 70 insertions(+), 111 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index aed4a4b07f34..1d3b2c94c53e 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -332,13 +332,6 @@ static int mv88e6352_ptp_enable_extts(struct mv88e6xxx_chip *chip, int pin; int err; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -566,6 +559,10 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip) chip->ptp_clock_info.verify = ptp_ops->ptp_verify; chip->ptp_clock_info.do_aux_work = mv88e6xxx_hwtstamp_work; + chip->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + if (ptp_ops->set_ptp_cpu_port) { struct dsa_port *dp; int upstream = 0; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 198e787e8560..3b979d88ca13 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -820,13 +820,6 @@ static int sja1105_extts_enable(struct sja1105_private *priv, if (extts->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* We can only enable time stamping on both edges, sadly. */ if ((extts->flags & PTP_STRICT_FLAGS) && (extts->flags & PTP_ENABLE_FEATURE) && @@ -912,6 +905,9 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) .n_pins = 1, .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, }; /* Only used on SJA1105 */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 0ea5e52303f2..660b32126553 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1627,14 +1627,6 @@ static int ice_ptp_cfg_extts(struct ice_pf *pf, struct ptp_extts_request *rq, int pin_desc_idx; u8 tmr_idx; - /* Reject requests with unsupported flags */ - - if (rq->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; chan = rq->index; @@ -2740,6 +2732,10 @@ static void ice_ptp_set_caps(struct ice_pf *pf) info->enable = ice_ptp_gpio_enable; info->verify = ice_verify_pin; + info->supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + switch (pf->hw.mac_type) { case ICE_MAC_E810: ice_ptp_set_funcs_e810(pf); diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index f323e1c1989f..793c96016288 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -502,13 +502,6 @@ static int igb_ptp_feature_enable_82580(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Both the rising and falling edge are timestamped */ if (rq->extts.flags & PTP_STRICT_FLAGS && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -658,13 +651,6 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1356,6 +1342,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; adapter->ptp_caps.pps = 0; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576; @@ -1378,6 +1367,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_ext_ts = IGB_N_EXTTS; adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pps = 1; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 946edbad4302..0c6c0cc78fac 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -257,13 +257,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1146,6 +1139,9 @@ void igc_ptp_init(struct igc_adapter *adapter) adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.n_ext_ts = IGC_N_EXTTS; adapter->ptp_caps.n_per_out = IGC_N_PEROUT; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.n_pins = IGC_N_SDP; adapter->ptp_caps.verify = igc_ptp_verify_pin; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 65a94e46edcf..3eee84430ac9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -643,13 +643,6 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1034,6 +1027,10 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) clock->ptp_info.verify = mlx5_ptp_verify; clock->ptp_info.pps = 1; + clock->ptp_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + for (i = 0; i < clock->ptp_info.n_pins; i++) { snprintf(clock->ptp_info.pin_config[i].name, sizeof(clock->ptp_info.pin_config[i].name), diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index 0be44dcb3393..b171c893175b 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -942,12 +942,6 @@ static int lan743x_ptp_io_extts(struct lan743x_adapter *adapter, int on, extts = &ptp->extts[index]; - if (extts_request->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - if (on) { extts_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_EXTTS, index); if (extts_pin < 0) @@ -1543,6 +1537,9 @@ int lan743x_ptp_open(struct lan743x_adapter *adapter) ptp->ptp_clock_info.n_per_out = LAN743X_PTP_N_EVENT_CHAN; ptp->ptp_clock_info.n_pins = n_pins; ptp->ptp_clock_info.pps = LAN743X_PTP_N_PPS; + ptp->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; ptp->ptp_clock_info.pin_config = ptp->pin_config; ptp->ptp_clock_info.adjfine = lan743x_ptpci_adjfine; ptp->ptp_clock_info.adjtime = lan743x_ptpci_adjtime; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 63905bb5a63a..8cf41b0977b2 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -917,12 +917,6 @@ static int lan966x_ptp_extts(struct ptp_clock_info *ptp, if (lan966x->ptp_ext_irq <= 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -978,6 +972,8 @@ static struct ptp_clock_info lan966x_ptp_clock_info = { .n_per_out = LAN966X_PHC_PINS_NUM, .n_ext_ts = LAN966X_PHC_PINS_NUM, .n_pins = LAN966X_PHC_PINS_NUM, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS, }; static int lan966x_ptp_phc_init(struct lan966x *lan966x, diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index b4365906669f..ab1ffdc7ee4f 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -176,12 +176,6 @@ static int ravb_ptp_extts(struct ptp_clock_info *ptp, struct net_device *ndev = priv->ndev; unsigned long flags; - /* Reject requests with unsupported flags */ - if (req->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; @@ -287,6 +281,7 @@ static const struct ptp_clock_info ravb_ptp_info = { .max_adj = 50000000, .n_ext_ts = N_EXT_TS, .n_per_out = N_PER_OUT, + .supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE, .adjfine = ravb_ptp_adjfine, .adjtime = ravb_ptp_adjtime, .gettime64 = ravb_ptp_gettime64, diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 85e231451093..c89c255185a6 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -478,13 +478,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1002,6 +995,9 @@ static void dp83640_clock_init(struct dp83640_clock *clock, struct mii_bus *bus) clock->caps.n_per_out = N_PER_OUT; clock->caps.n_pins = DP83640_N_PINS; clock->caps.pps = 0; + clock->caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; clock->caps.adjfine = ptp_dp83640_adjfine; clock->caps.adjtime = ptp_dp83640_adjtime; clock->caps.gettime64 = ptp_dp83640_gettime; diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 24882d30f685..61123ec4c878 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3406,11 +3406,6 @@ static int lan8814_ptp_extts(struct ptp_clock_info *ptpci, struct phy_device *phydev = shared->phydev; int pin; - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_EXTTS_EDGES | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(shared->ptp_clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin != LAN8814_PTP_EXTTS_NUM) @@ -3917,6 +3912,9 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) shared->ptp_clock_info.n_ext_ts = LAN8814_PTP_EXTTS_NUM; shared->ptp_clock_info.n_pins = LAN8814_PTP_GPIO_NUM; shared->ptp_clock_info.pps = 0; + shared->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; shared->ptp_clock_info.pin_config = shared->pin_config; shared->ptp_clock_info.n_per_out = LAN8814_PTP_PEROUT_NUM; shared->ptp_clock_info.adjfine = lan8814_ptpci_adjfine; diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 250a018d5546..8634b4cb1e70 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -861,12 +861,6 @@ static int nxp_c45_extts_enable(struct nxp_c45_phy *priv, const struct nxp_c45_phy_data *data = nxp_c45_get_data(priv->phydev); int pin; - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Sampling on both edges is not supported */ if ((extts->flags & PTP_RISING_EDGE) && (extts->flags & PTP_FALLING_EDGE) && @@ -962,6 +956,9 @@ static int nxp_c45_init_ptp_clock(struct nxp_c45_phy *priv) .n_pins = ARRAY_SIZE(nxp_c45_ptp_pins), .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, }; priv->ptp_clock = ptp_clock_register(&priv->caps, diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4380e6ddb849..c24228c13954 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -162,6 +162,7 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); + unsigned int i, pin_index, supported_extts_flags; struct ptp_sys_offset_extended *extoff = NULL; struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; @@ -172,7 +173,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct ptp_clock_request req; struct ptp_clock_caps caps; struct ptp_clock_time *pct; - unsigned int i, pin_index; struct ptp_pin_desc pd; struct timespec64 ts; int enable, err = 0; @@ -240,6 +240,18 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, err = -EINVAL; break; } + supported_extts_flags = ptp->info->supported_extts_flags; + /* The PTP_ENABLE_FEATURE flag is always supported. */ + supported_extts_flags |= PTP_ENABLE_FEATURE; + /* If the driver does not support strictly checking flags, the + * PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely + * hints which are not enforced. + */ + if (!(supported_extts_flags & PTP_STRICT_FLAGS)) + supported_extts_flags |= PTP_EXTTS_EDGES; + /* Reject unsupported flags */ + if (req.extts.flags & ~supported_extts_flags) + return -EOPNOTSUPP; req.type = PTP_CLK_REQ_EXTTS; enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0; if (mutex_lock_interruptible(&ptp->pincfg_mux)) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index fbb3fa8fc60b..b8d4df8c6da2 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -283,18 +283,6 @@ static int idtcm_extts_enable(struct idtcm_channel *channel, idtcm = channel->idtcm; old_mask = idtcm->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_TOD) return -EINVAL; @@ -2043,6 +2031,7 @@ static const struct ptp_clock_info idtcm_caps = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, @@ -2060,6 +2049,7 @@ static const struct ptp_clock_info idtcm_caps_deprecated = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c index cfced36c70bc..70002500170e 100644 --- a/drivers/ptp/ptp_fc3.c +++ b/drivers/ptp/ptp_fc3.c @@ -592,6 +592,7 @@ static const struct ptp_clock_info idtfc3_caps = { .max_adj = MAX_FFO_PPB, .n_per_out = 1, .n_ext_ts = 1, + .supported_extts_flags = PTP_STRICT_FLAGS | PTP_EXT_OFFSET, .adjphase = &idtfc3_adjphase, .adjfine = &idtfc3_adjfine, .adjtime = &idtfc3_adjtime, diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index b2fd94d4f863..f01c50dfa44e 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -246,18 +246,6 @@ static int idt82p33_extts_enable(struct idt82p33_channel *channel, idt82p33 = channel->idt82p33; old_mask = idt82p33->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_PHC_PLL) return -EINVAL; @@ -1187,6 +1175,9 @@ static void idt82p33_caps_init(u32 index, struct ptp_clock_info *caps, caps->pin_config = pin_cfg; + caps->supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS; + for (i = 0; i < max_pins; ++i) { ppd = &pin_cfg[i]; diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d68d09bedd1..25cba2e5ee69 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -68,6 +68,17 @@ struct ptp_system_timestamp { * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. + * + * @supported_extts_flags: The set of flags the driver supports for the + * PTP_EXTTS_REQUEST ioctl. The PTP core will use + * this list to reject unsupported requests. + * PTP_ENABLE_FEATURE is assumed and does not need to + * be included. If PTP_STRICT_FLAGS is *not* set, + * then both PTP_RISING_EDGE and PTP_FALLING_EDGE + * will be assumed. Note that PTP_STRICT_FLAGS must + * be set if the drivers wants to honor + * PTP_EXTTS_REQUEST2 and any future flags. + * * @pin_config: Array of length 'n_pins'. If the number of * programmable pins is nonzero, then drivers must * allocate and initialize this array. @@ -174,6 +185,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); -- cgit v1.2.3 From d9f3e9ecc4562ae07aaf614cf0a6690ef7ca0e10 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:31 -0700 Subject: net: ptp: introduce .supported_perout_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_PEROUT_REQUEST2 ioctl has gained support for flags specifying specific output behavior including PTP_PEROUT_ONE_SHOT, PTP_PEROUT_DUTY_CYCLE, PTP_PEROUT_PHASE. Driver authors are notorious for not checking the flags of the request. This results in misinterpreting the request, generating an output signal that does not match the requested value. It is anticipated that even more flags will be added in the future, resulting in even more broken requests. Expecting these issues to be caught during review or playing whack-a-mole after the fact is not a great solution. Instead, introduce the supported_perout_flags field in the ptp_clock_info structure. Update the core character device logic to explicitly reject any request which has a flag not on this list. This ensures that drivers must 'opt in' to the flags they support. Drivers which don't set the .supported_perout_flags field will not need to check that unsupported flags aren't passed, as the core takes care of this. Update the drivers which do support flags to set this new field. Note the following driver files set n_per_out to a non-zero value but did not check the flags at all: • drivers/ptp/ptp_clockmatrix.c • drivers/ptp/ptp_idt82p33.c • drivers/ptp/ptp_fc3.c • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/aquantia/atlantic/aq_ptp.c • drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c • drivers/net/dsa/sja1105/sja1105_ptp.c • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/mscc/ocelot_vsc7514.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c Reviewed-by: Vadim Fedorenko Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-2-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_ptp.c | 4 ---- drivers/net/ethernet/intel/ice/ice_ptp.c | 4 +--- drivers/net/ethernet/intel/igc/igc_ptp.c | 4 ---- drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 15 +++------------ drivers/net/ethernet/microchip/lan743x_ptp.c | 5 +---- drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c | 6 ++---- drivers/net/ethernet/mscc/ocelot_ptp.c | 5 ----- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 2 ++ drivers/net/ethernet/renesas/ravb_ptp.c | 4 ---- drivers/net/phy/dp83640.c | 3 --- drivers/net/phy/micrel.c | 9 ++------- drivers/net/phy/microchip_rds_ptp.c | 5 +---- drivers/net/phy/nxp-c45-tja11xx.c | 4 +--- drivers/ptp/ptp_chardev.c | 2 ++ include/linux/ptp_clock_kernel.h | 6 ++++++ 15 files changed, 21 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 3b979d88ca13..3abc64aec411 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -737,10 +737,6 @@ static int sja1105_per_out_enable(struct sja1105_private *priv, if (perout->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (perout->flags) - return -EOPNOTSUPP; - mutex_lock(&ptp_data->lock); rc = sja1105_change_ptp_clk_pin_func(priv, PTP_PF_PEROUT); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 660b32126553..b79a148ed0f2 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1797,9 +1797,6 @@ static int ice_ptp_cfg_perout(struct ice_pf *pf, struct ptp_perout_request *rq, struct ice_hw *hw = &pf->hw; int pin_desc_idx; - if (rq->flags & ~PTP_PEROUT_PHASE) - return -EOPNOTSUPP; - pin_desc_idx = ice_ptp_find_pin_idx(pf, PTP_PF_PEROUT, rq->index); if (pin_desc_idx < 0) return -EIO; @@ -2735,6 +2732,7 @@ static void ice_ptp_set_caps(struct ice_pf *pf) info->supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + info->supported_perout_flags = PTP_PEROUT_PHASE; switch (pf->hw.mac_type) { case ICE_MAC_E810: diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 0c6c0cc78fac..b6c60b3d0e3a 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -293,10 +293,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; - if (on) { pin = ptp_find_pin(igc->ptp_clock, PTP_PF_PEROUT, rq->perout.index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 3eee84430ac9..cec18efadc73 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -813,12 +813,6 @@ static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clo return 0; } -static bool mlx5_perout_verify_flags(struct mlx5_core_dev *mdev, unsigned int flags) -{ - return ((!mlx5_npps_real_time_supported(mdev) && flags) || - (mlx5_npps_real_time_supported(mdev) && flags & ~PTP_PEROUT_DUTY_CYCLE)); -} - static int mlx5_perout_configure(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) @@ -854,12 +848,6 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, goto unlock; } - /* Reject requests with unsupported flags */ - if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) { - err = -EOPNOTSUPP; - goto unlock; - } - if (on) { pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; @@ -1031,6 +1019,9 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + if (mlx5_npps_real_time_supported(mdev)) + clock->ptp_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; + for (i = 0; i < clock->ptp_info.n_pins; i++) { snprintf(clock->ptp_info.pin_config[i].name, sizeof(clock->ptp_info.pin_config[i].name), diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index b171c893175b..b07f5b099a2b 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -463,10 +463,6 @@ static int lan743x_ptp_perout(struct lan743x_adapter *adapter, int on, struct lan743x_ptp_perout *perout = &ptp->perout[index]; int ret = 0; - /* Reject requests with unsupported flags */ - if (perout_request->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - if (on) { perout_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_PEROUT, perout_request->index); @@ -1540,6 +1536,7 @@ int lan743x_ptp_open(struct lan743x_adapter *adapter) ptp->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + ptp->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; ptp->ptp_clock_info.pin_config = ptp->pin_config; ptp->ptp_clock_info.adjfine = lan743x_ptpci_adjfine; ptp->ptp_clock_info.adjtime = lan743x_ptpci_adjtime; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 8cf41b0977b2..098406e2e5bb 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -815,10 +815,6 @@ static int lan966x_ptp_perout(struct ptp_clock_info *ptp, bool pps = false; int pin; - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -974,6 +970,8 @@ static struct ptp_clock_info lan966x_ptp_clock_info = { .n_pins = LAN966X_PHC_PINS_NUM, .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, }; static int lan966x_ptp_phc_init(struct lan966x *lan966x, diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index cc1088988da0..d2a0a32f75ea 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -211,11 +211,6 @@ int ocelot_ptp_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(ocelot->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == 0) diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 055b55651a49..498eec8ae61d 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -108,6 +108,8 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .n_ext_ts = 0, .n_per_out = OCELOT_PTP_PINS_NUM, .n_pins = OCELOT_PTP_PINS_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, .pps = 0, .gettime64 = ocelot_ptp_gettime64, .settime64 = ocelot_ptp_settime64, diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index ab1ffdc7ee4f..226c6c0ab945 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -206,10 +206,6 @@ static int ravb_ptp_perout(struct ptp_clock_info *ptp, unsigned long flags; int error = 0; - /* Reject requests with unsupported flags */ - if (req->flags) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index c89c255185a6..daab555721df 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -506,9 +506,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; if (rq->perout.index >= N_PER_OUT) return -EINVAL; return periodic_output(clock, rq, on, rq->perout.index); diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 61123ec4c878..71fb4410c31b 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3236,10 +3236,6 @@ static int lan8814_ptp_perout(struct ptp_clock_info *ptpci, int pulse_width; int pin, event; - /* Reject requests with unsupported flags */ - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - mutex_lock(&shared->shared_lock); event = rq->perout.index; pin = ptp_find_pin(shared->ptp_clock, PTP_PF_PEROUT, event); @@ -3915,6 +3911,7 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) shared->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + shared->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; shared->ptp_clock_info.pin_config = shared->pin_config; shared->ptp_clock_info.n_per_out = LAN8814_PTP_PEROUT_NUM; shared->ptp_clock_info.adjfine = lan8814_ptpci_adjfine; @@ -5066,9 +5063,6 @@ static int lan8841_ptp_perout(struct ptp_clock_info *ptp, int pin; int ret; - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - pin = ptp_find_pin(ptp_priv->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN8841_PTP_GPIO_NUM) return -EINVAL; @@ -5312,6 +5306,7 @@ static struct ptp_clock_info lan8841_ptp_clock_info = { .n_per_out = LAN8841_PTP_GPIO_NUM, .n_ext_ts = LAN8841_PTP_GPIO_NUM, .n_pins = LAN8841_PTP_GPIO_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, }; #define LAN8841_OPERATION_MODE_STRAP_LOW_REGISTER 3 diff --git a/drivers/net/phy/microchip_rds_ptp.c b/drivers/net/phy/microchip_rds_ptp.c index 3e6bf10cdeed..e6514ce04c29 100644 --- a/drivers/net/phy/microchip_rds_ptp.c +++ b/drivers/net/phy/microchip_rds_ptp.c @@ -224,10 +224,6 @@ static int mchp_rds_ptp_perout(struct ptp_clock_info *ptpci, struct phy_device *phydev = clock->phydev; int ret, event_pin, pulsewidth; - /* Reject requests with unsupported flags */ - if (perout->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - event_pin = ptp_find_pin(clock->ptp_clock, PTP_PF_PEROUT, perout->index); if (event_pin != clock->event_pin) @@ -1259,6 +1255,7 @@ struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd, clock->caps.pps = 0; clock->caps.n_pins = MCHP_RDS_PTP_N_PIN; clock->caps.n_per_out = MCHP_RDS_PTP_N_PEROUT; + clock->caps.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; clock->caps.pin_config = clock->pin_config; clock->caps.adjfine = mchp_rds_ptp_ltc_adjfine; clock->caps.adjtime = mchp_rds_ptp_ltc_adjtime; diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 8634b4cb1e70..f11dd32494c3 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -763,9 +763,6 @@ static int nxp_c45_perout_enable(struct nxp_c45_phy *priv, struct phy_device *phydev = priv->phydev; int pin; - if (perout->flags & ~PTP_PEROUT_PHASE) - return -EOPNOTSUPP; - pin = ptp_find_pin(priv->ptp_clock, PTP_PF_PEROUT, perout->index); if (pin < 0) return pin; @@ -959,6 +956,7 @@ static int nxp_c45_init_ptp_clock(struct nxp_c45_phy *priv) .supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS, + .supported_perout_flags = PTP_PEROUT_PHASE, }; priv->ptp_clock = ptp_clock_register(&priv->caps, diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index c24228c13954..4bf421765d03 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -324,6 +324,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, err = -EINVAL; break; } + if (req.perout.flags & ~ptp->info->supported_perout_flags) + return -EOPNOTSUPP; req.type = PTP_CLK_REQ_PEROUT; enable = req.perout.period.sec || req.perout.period.nsec; if (mutex_lock_interruptible(&ptp->pincfg_mux)) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 25cba2e5ee69..eced7e9bf69a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -69,6 +69,11 @@ struct ptp_system_timestamp { * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. * + * @supported_perout_flags: The set of flags the driver supports for the + * PTP_PEROUT_REQUEST ioctl. The PTP core will + * reject a request with any flag not specified + * here. + * * @supported_extts_flags: The set of flags the driver supports for the * PTP_EXTTS_REQUEST ioctl. The PTP core will use * this list to reject unsupported requests. @@ -185,6 +190,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_perout_flags; unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); -- cgit v1.2.3 From 43eca05b6a3b917c600e10cc6b06bfa57fa57401 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 11 Apr 2025 10:49:56 +0300 Subject: xfrm: Add explicit dev to .xdo_dev_state_{add,delete,free} Previously, device driver IPSec offload implementations would fall into two categories: 1. Those that used xso.dev to determine the offload device. 2. Those that used xso.real_dev to determine the offload device. The first category didn't work with bonding while the second did. In a non-bonding setup the two pointers are the same. This commit adds explicit pointers for the offload netdevice to .xdo_dev_state_add() / .xdo_dev_state_delete() / .xdo_dev_state_free() which eliminates the confusion and allows drivers from the first category to work with bonding. xso.real_dev now becomes a private pointer managed by the bonding driver. Signed-off-by: Cosmin Ratiu Reviewed-by: Leon Romanovsky Reviewed-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- Documentation/networking/xfrm_device.rst | 10 ++++-- drivers/net/bonding/bond_main.c | 33 +++++++++-------- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 20 ++++++----- .../chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c | 18 ++++++---- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 41 ++++++++++++---------- drivers/net/ethernet/intel/ixgbevf/ipsec.c | 21 ++++++----- .../ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 18 +++++----- .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 12 +++---- drivers/net/ethernet/netronome/nfp/crypto/ipsec.c | 11 +++--- drivers/net/netdevsim/ipsec.c | 15 ++++---- include/linux/netdevice.h | 10 ++++-- include/net/xfrm.h | 8 +++++ net/xfrm/xfrm_device.c | 4 +-- net/xfrm/xfrm_state.c | 14 ++++---- 14 files changed, 136 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/Documentation/networking/xfrm_device.rst b/Documentation/networking/xfrm_device.rst index 7f24c09f2694..122204da0fff 100644 --- a/Documentation/networking/xfrm_device.rst +++ b/Documentation/networking/xfrm_device.rst @@ -65,9 +65,13 @@ Callbacks to implement /* from include/linux/netdevice.h */ struct xfrmdev_ops { /* Crypto and Packet offload callbacks */ - int (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack); - void (*xdo_dev_state_delete) (struct xfrm_state *x); - void (*xdo_dev_state_free) (struct xfrm_state *x); + int (*xdo_dev_state_add)(struct net_device *dev, + struct xfrm_state *x, + struct netlink_ext_ack *extack); + void (*xdo_dev_state_delete)(struct net_device *dev, + struct xfrm_state *x); + void (*xdo_dev_state_free)(struct net_device *dev, + struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 950d8e4d86f8..4ba525a564c5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -453,13 +453,14 @@ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs) /** * bond_ipsec_add_sa - program device with a security association + * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct * @extack: extack point to fill failure reason **/ -static int bond_ipsec_add_sa(struct xfrm_state *xs, +static int bond_ipsec_add_sa(struct net_device *bond_dev, + struct xfrm_state *xs, struct netlink_ext_ack *extack) { - struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; struct bond_ipsec *ipsec; @@ -496,7 +497,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs, } xs->xso.real_dev = real_dev; - err = real_dev->xfrmdev_ops->xdo_dev_state_add(xs, extack); + err = real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, xs, extack); if (!err) { ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); @@ -540,7 +541,8 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) continue; ipsec->xs->xso.real_dev = real_dev; - if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) { + if (real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, + ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } @@ -551,11 +553,12 @@ out: /** * bond_ipsec_del_sa - clear out this specific SA + * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct **/ -static void bond_ipsec_del_sa(struct xfrm_state *xs) +static void bond_ipsec_del_sa(struct net_device *bond_dev, + struct xfrm_state *xs) { - struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; struct bond_ipsec *ipsec; @@ -587,7 +590,7 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs) goto out; } - real_dev->xfrmdev_ops->xdo_dev_state_delete(xs); + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); out: netdev_put(real_dev, &tracker); mutex_lock(&bond->ipsec_lock); @@ -624,18 +627,20 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); - } else { - real_dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); - if (real_dev->xfrmdev_ops->xdo_dev_state_free) - real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs); + continue; } + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, + ipsec->xs); + if (real_dev->xfrmdev_ops->xdo_dev_state_free) + real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, + ipsec->xs); } mutex_unlock(&bond->ipsec_lock); } -static void bond_ipsec_free_sa(struct xfrm_state *xs) +static void bond_ipsec_free_sa(struct net_device *bond_dev, + struct xfrm_state *xs) { - struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; netdevice_tracker tracker; struct bonding *bond; @@ -661,7 +666,7 @@ static void bond_ipsec_free_sa(struct xfrm_state *xs) if (real_dev && real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) - real_dev->xfrmdev_ops->xdo_dev_state_free(xs); + real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); out: netdev_put(real_dev, &tracker); } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 551c279dc14b..51395c96b2e9 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -6480,10 +6480,11 @@ static const struct tlsdev_ops cxgb4_ktls_ops = { #if IS_ENABLED(CONFIG_CHELSIO_IPSEC_INLINE) -static int cxgb4_xfrm_add_state(struct xfrm_state *x, +static int cxgb4_xfrm_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { - struct adapter *adap = netdev2adap(x->xso.dev); + struct adapter *adap = netdev2adap(dev); int ret; if (!mutex_trylock(&uld_mutex)) { @@ -6494,7 +6495,8 @@ static int cxgb4_xfrm_add_state(struct xfrm_state *x, if (ret) goto out_unlock; - ret = adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_add(x, extack); + ret = adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_add(dev, x, + extack); out_unlock: mutex_unlock(&uld_mutex); @@ -6502,9 +6504,9 @@ out_unlock: return ret; } -static void cxgb4_xfrm_del_state(struct xfrm_state *x) +static void cxgb4_xfrm_del_state(struct net_device *dev, struct xfrm_state *x) { - struct adapter *adap = netdev2adap(x->xso.dev); + struct adapter *adap = netdev2adap(dev); if (!mutex_trylock(&uld_mutex)) { dev_dbg(adap->pdev_dev, @@ -6514,15 +6516,15 @@ static void cxgb4_xfrm_del_state(struct xfrm_state *x) if (chcr_offload_state(adap, CXGB4_XFRMDEV_OPS)) goto out_unlock; - adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_delete(x); + adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_delete(dev, x); out_unlock: mutex_unlock(&uld_mutex); } -static void cxgb4_xfrm_free_state(struct xfrm_state *x) +static void cxgb4_xfrm_free_state(struct net_device *dev, struct xfrm_state *x) { - struct adapter *adap = netdev2adap(x->xso.dev); + struct adapter *adap = netdev2adap(dev); if (!mutex_trylock(&uld_mutex)) { dev_dbg(adap->pdev_dev, @@ -6532,7 +6534,7 @@ static void cxgb4_xfrm_free_state(struct xfrm_state *x) if (chcr_offload_state(adap, CXGB4_XFRMDEV_OPS)) goto out_unlock; - adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_free(x); + adap->uld[CXGB4_ULD_IPSEC].xfrmdev_ops->xdo_dev_state_free(dev, x); out_unlock: mutex_unlock(&uld_mutex); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c index baba96883f48..ecd9a0bd5e18 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c @@ -75,9 +75,12 @@ static int ch_ipsec_uld_state_change(void *handle, enum cxgb4_state new_state); static int ch_ipsec_xmit(struct sk_buff *skb, struct net_device *dev); static void *ch_ipsec_uld_add(const struct cxgb4_lld_info *infop); static void ch_ipsec_advance_esn_state(struct xfrm_state *x); -static void ch_ipsec_xfrm_free_state(struct xfrm_state *x); -static void ch_ipsec_xfrm_del_state(struct xfrm_state *x); -static int ch_ipsec_xfrm_add_state(struct xfrm_state *x, +static void ch_ipsec_xfrm_free_state(struct net_device *dev, + struct xfrm_state *x); +static void ch_ipsec_xfrm_del_state(struct net_device *dev, + struct xfrm_state *x); +static int ch_ipsec_xfrm_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack); static const struct xfrmdev_ops ch_ipsec_xfrmdev_ops = { @@ -223,7 +226,8 @@ out: * returns 0 on success, negative error if failed to send message to FPGA * positive error if FPGA returned a bad response */ -static int ch_ipsec_xfrm_add_state(struct xfrm_state *x, +static int ch_ipsec_xfrm_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ipsec_sa_entry *sa_entry; @@ -302,14 +306,16 @@ out: return res; } -static void ch_ipsec_xfrm_del_state(struct xfrm_state *x) +static void ch_ipsec_xfrm_del_state(struct net_device *dev, + struct xfrm_state *x) { /* do nothing */ if (!x->xso.offload_handle) return; } -static void ch_ipsec_xfrm_free_state(struct xfrm_state *x) +static void ch_ipsec_xfrm_free_state(struct net_device *dev, + struct xfrm_state *x) { struct ipsec_sa_entry *sa_entry; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 07ea1954a276..796e90d741f0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -9,7 +9,7 @@ #define IXGBE_IPSEC_KEY_BITS 160 static const char aes_gcm_name[] = "rfc4106(gcm(aes))"; -static void ixgbe_ipsec_del_sa(struct xfrm_state *xs); +static void ixgbe_ipsec_del_sa(struct net_device *dev, struct xfrm_state *xs); /** * ixgbe_ipsec_set_tx_sa - set the Tx SA registers @@ -321,7 +321,7 @@ void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter) if (r->used) { if (r->mode & IXGBE_RXTXMOD_VF) - ixgbe_ipsec_del_sa(r->xs); + ixgbe_ipsec_del_sa(adapter->netdev, r->xs); else ixgbe_ipsec_set_rx_sa(hw, i, r->xs->id.spi, r->key, r->salt, @@ -330,7 +330,7 @@ void ixgbe_ipsec_restore(struct ixgbe_adapter *adapter) if (t->used) { if (t->mode & IXGBE_RXTXMOD_VF) - ixgbe_ipsec_del_sa(t->xs); + ixgbe_ipsec_del_sa(adapter->netdev, t->xs); else ixgbe_ipsec_set_tx_sa(hw, i, t->key, t->salt); } @@ -417,6 +417,7 @@ static struct xfrm_state *ixgbe_ipsec_find_rx_state(struct ixgbe_ipsec *ipsec, /** * ixgbe_ipsec_parse_proto_keys - find the key and salt based on the protocol + * @dev: pointer to net device * @xs: pointer to xfrm_state struct * @mykey: pointer to key array to populate * @mysalt: pointer to salt value to populate @@ -424,10 +425,10 @@ static struct xfrm_state *ixgbe_ipsec_find_rx_state(struct ixgbe_ipsec *ipsec, * This copies the protocol keys and salt to our own data tables. The * 82599 family only supports the one algorithm. **/ -static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs, +static int ixgbe_ipsec_parse_proto_keys(struct net_device *dev, + struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { - struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; @@ -473,11 +474,12 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs, /** * ixgbe_ipsec_check_mgmt_ip - make sure there is no clash with mgmt IP filters + * @dev: pointer to net device * @xs: pointer to transformer state struct **/ -static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs) +static int ixgbe_ipsec_check_mgmt_ip(struct net_device *dev, + struct xfrm_state *xs) { - struct net_device *dev = xs->xso.real_dev; struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_hw *hw = &adapter->hw; u32 mfval, manc, reg; @@ -556,13 +558,14 @@ static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs) /** * ixgbe_ipsec_add_sa - program device with a security association + * @dev: pointer to device to program * @xs: pointer to transformer state struct * @extack: extack point to fill failure reason **/ -static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, +static int ixgbe_ipsec_add_sa(struct net_device *dev, + struct xfrm_state *xs, struct netlink_ext_ack *extack) { - struct net_device *dev = xs->xso.real_dev; struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct ixgbe_hw *hw = &adapter->hw; @@ -581,7 +584,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, return -EINVAL; } - if (ixgbe_ipsec_check_mgmt_ip(xs)) { + if (ixgbe_ipsec_check_mgmt_ip(dev, xs)) { NL_SET_ERR_MSG_MOD(extack, "IPsec IP addr clash with mgmt filters"); return -EINVAL; } @@ -615,7 +618,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, rsa.decrypt = xs->ealg || xs->aead; /* get the key and salt */ - ret = ixgbe_ipsec_parse_proto_keys(xs, rsa.key, &rsa.salt); + ret = ixgbe_ipsec_parse_proto_keys(dev, xs, rsa.key, &rsa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for Rx SA table"); return ret; @@ -724,7 +727,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, if (xs->id.proto & IPPROTO_ESP) tsa.encrypt = xs->ealg || xs->aead; - ret = ixgbe_ipsec_parse_proto_keys(xs, tsa.key, &tsa.salt); + ret = ixgbe_ipsec_parse_proto_keys(dev, xs, tsa.key, &tsa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for Tx SA table"); memset(&tsa, 0, sizeof(tsa)); @@ -752,11 +755,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs, /** * ixgbe_ipsec_del_sa - clear out this specific SA + * @dev: pointer to device to program * @xs: pointer to transformer state struct **/ -static void ixgbe_ipsec_del_sa(struct xfrm_state *xs) +static void ixgbe_ipsec_del_sa(struct net_device *dev, struct xfrm_state *xs) { - struct net_device *dev = xs->xso.real_dev; struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct ixgbe_hw *hw = &adapter->hw; @@ -841,7 +844,8 @@ void ixgbe_ipsec_vf_clear(struct ixgbe_adapter *adapter, u32 vf) continue; if (ipsec->rx_tbl[i].mode & IXGBE_RXTXMOD_VF && ipsec->rx_tbl[i].vf == vf) - ixgbe_ipsec_del_sa(ipsec->rx_tbl[i].xs); + ixgbe_ipsec_del_sa(adapter->netdev, + ipsec->rx_tbl[i].xs); } /* search tx sa table */ @@ -850,7 +854,8 @@ void ixgbe_ipsec_vf_clear(struct ixgbe_adapter *adapter, u32 vf) continue; if (ipsec->tx_tbl[i].mode & IXGBE_RXTXMOD_VF && ipsec->tx_tbl[i].vf == vf) - ixgbe_ipsec_del_sa(ipsec->tx_tbl[i].xs); + ixgbe_ipsec_del_sa(adapter->netdev, + ipsec->tx_tbl[i].xs); } } @@ -930,7 +935,7 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) memcpy(xs->aead->alg_name, aes_gcm_name, sizeof(aes_gcm_name)); /* set up the HW offload */ - err = ixgbe_ipsec_add_sa(xs, NULL); + err = ixgbe_ipsec_add_sa(adapter->netdev, xs, NULL); if (err) goto err_aead; @@ -1034,7 +1039,7 @@ int ixgbe_ipsec_vf_del_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) xs = ipsec->tx_tbl[sa_idx].xs; } - ixgbe_ipsec_del_sa(xs); + ixgbe_ipsec_del_sa(adapter->netdev, xs); /* remove the xs that was made-up in the add request */ kfree_sensitive(xs); diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c index 8ba037e3d9c2..65580b9cb06f 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c +++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c @@ -201,6 +201,7 @@ struct xfrm_state *ixgbevf_ipsec_find_rx_state(struct ixgbevf_ipsec *ipsec, /** * ixgbevf_ipsec_parse_proto_keys - find the key and salt based on the protocol + * @dev: pointer to net device to program * @xs: pointer to xfrm_state struct * @mykey: pointer to key array to populate * @mysalt: pointer to salt value to populate @@ -208,10 +209,10 @@ struct xfrm_state *ixgbevf_ipsec_find_rx_state(struct ixgbevf_ipsec *ipsec, * This copies the protocol keys and salt to our own data tables. The * 82599 family only supports the one algorithm. **/ -static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs, +static int ixgbevf_ipsec_parse_proto_keys(struct net_device *dev, + struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { - struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; @@ -256,13 +257,14 @@ static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs, /** * ixgbevf_ipsec_add_sa - program device with a security association + * @dev: pointer to net device to program * @xs: pointer to transformer state struct * @extack: extack point to fill failure reason **/ -static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs, +static int ixgbevf_ipsec_add_sa(struct net_device *dev, + struct xfrm_state *xs, struct netlink_ext_ack *extack) { - struct net_device *dev = xs->xso.real_dev; struct ixgbevf_adapter *adapter; struct ixgbevf_ipsec *ipsec; u16 sa_idx; @@ -310,7 +312,8 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs, rsa.decrypt = xs->ealg || xs->aead; /* get the key and salt */ - ret = ixgbevf_ipsec_parse_proto_keys(xs, rsa.key, &rsa.salt); + ret = ixgbevf_ipsec_parse_proto_keys(dev, xs, rsa.key, + &rsa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for Rx SA table"); return ret; @@ -363,7 +366,8 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs, if (xs->id.proto & IPPROTO_ESP) tsa.encrypt = xs->ealg || xs->aead; - ret = ixgbevf_ipsec_parse_proto_keys(xs, tsa.key, &tsa.salt); + ret = ixgbevf_ipsec_parse_proto_keys(dev, xs, tsa.key, + &tsa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for Tx SA table"); memset(&tsa, 0, sizeof(tsa)); @@ -388,11 +392,12 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs, /** * ixgbevf_ipsec_del_sa - clear out this specific SA + * @dev: pointer to net device to program * @xs: pointer to transformer state struct **/ -static void ixgbevf_ipsec_del_sa(struct xfrm_state *xs) +static void ixgbevf_ipsec_del_sa(struct net_device *dev, + struct xfrm_state *xs) { - struct net_device *dev = xs->xso.real_dev; struct ixgbevf_adapter *adapter; struct ixgbevf_ipsec *ipsec; u16 sa_idx; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index fc59e50bafce..a6500e3673f2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -663,10 +663,10 @@ static int cn10k_ipsec_inb_add_state(struct xfrm_state *x, return -EOPNOTSUPP; } -static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, +static int cn10k_ipsec_outb_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { - struct net_device *netdev = x->xso.dev; struct cn10k_tx_sa_s *sa_entry; struct qmem *sa_info; struct otx2_nic *pf; @@ -676,7 +676,7 @@ static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, if (err) return err; - pf = netdev_priv(netdev); + pf = netdev_priv(dev); err = qmem_alloc(pf->dev, &sa_info, pf->ipsec.sa_size, OTX2_ALIGN); if (err) @@ -700,18 +700,18 @@ static int cn10k_ipsec_outb_add_state(struct xfrm_state *x, return 0; } -static int cn10k_ipsec_add_state(struct xfrm_state *x, +static int cn10k_ipsec_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) return cn10k_ipsec_inb_add_state(x, extack); else - return cn10k_ipsec_outb_add_state(x, extack); + return cn10k_ipsec_outb_add_state(dev, x, extack); } -static void cn10k_ipsec_del_state(struct xfrm_state *x) +static void cn10k_ipsec_del_state(struct net_device *dev, struct xfrm_state *x) { - struct net_device *netdev = x->xso.dev; struct cn10k_tx_sa_s *sa_entry; struct qmem *sa_info; struct otx2_nic *pf; @@ -720,7 +720,7 @@ static void cn10k_ipsec_del_state(struct xfrm_state *x) if (x->xso.dir == XFRM_DEV_OFFLOAD_IN) return; - pf = netdev_priv(netdev); + pf = netdev_priv(dev); sa_info = (struct qmem *)x->xso.offload_handle; sa_entry = (struct cn10k_tx_sa_s *)sa_info->base; @@ -732,7 +732,7 @@ static void cn10k_ipsec_del_state(struct xfrm_state *x) err = cn10k_outb_write_sa(pf, sa_info); if (err) - netdev_err(netdev, "Error (%d) deleting SA\n", err); + netdev_err(dev, "Error (%d) deleting SA\n", err); x->xso.offload_handle = 0; qmem_free(pf->dev, sa_info); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 0dfbbe21936f..77f61cd28a79 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -689,17 +689,17 @@ static int mlx5e_ipsec_create_dwork(struct mlx5e_ipsec_sa_entry *sa_entry) return 0; } -static int mlx5e_xfrm_add_state(struct xfrm_state *x, +static int mlx5e_xfrm_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { struct mlx5e_ipsec_sa_entry *sa_entry = NULL; - struct net_device *netdev = x->xso.real_dev; struct mlx5e_ipsec *ipsec; struct mlx5e_priv *priv; gfp_t gfp; int err; - priv = netdev_priv(netdev); + priv = netdev_priv(dev); if (!priv->ipsec) return -EOPNOTSUPP; @@ -710,7 +710,7 @@ static int mlx5e_xfrm_add_state(struct xfrm_state *x, return -ENOMEM; sa_entry->x = x; - sa_entry->dev = netdev; + sa_entry->dev = dev; sa_entry->ipsec = ipsec; /* Check if this SA is originated from acquire flow temporary SA */ if (x->xso.flags & XFRM_DEV_OFFLOAD_FLAG_ACQ) @@ -807,7 +807,7 @@ err_xfrm: return err; } -static void mlx5e_xfrm_del_state(struct xfrm_state *x) +static void mlx5e_xfrm_del_state(struct net_device *dev, struct xfrm_state *x) { struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); struct mlx5e_ipsec *ipsec = sa_entry->ipsec; @@ -820,7 +820,7 @@ static void mlx5e_xfrm_del_state(struct xfrm_state *x) WARN_ON(old != sa_entry); } -static void mlx5e_xfrm_free_state(struct xfrm_state *x) +static void mlx5e_xfrm_free_state(struct net_device *dev, struct xfrm_state *x) { struct mlx5e_ipsec_sa_entry *sa_entry = to_ipsec_sa_entry(x); struct mlx5e_ipsec *ipsec = sa_entry->ipsec; diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 671af5d4c5d2..9e7c285eaa6b 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -266,17 +266,17 @@ static void set_sha2_512hmac(struct nfp_ipsec_cfg_add_sa *cfg, int *trunc_len) } } -static int nfp_net_xfrm_add_state(struct xfrm_state *x, +static int nfp_net_xfrm_add_state(struct net_device *dev, + struct xfrm_state *x, struct netlink_ext_ack *extack) { - struct net_device *netdev = x->xso.real_dev; struct nfp_ipsec_cfg_mssg msg = {}; int i, key_len, trunc_len, err = 0; struct nfp_ipsec_cfg_add_sa *cfg; struct nfp_net *nn; unsigned int saidx; - nn = netdev_priv(netdev); + nn = netdev_priv(dev); cfg = &msg.cfg_add_sa; /* General */ @@ -546,17 +546,16 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x, return 0; } -static void nfp_net_xfrm_del_state(struct xfrm_state *x) +static void nfp_net_xfrm_del_state(struct net_device *dev, struct xfrm_state *x) { struct nfp_ipsec_cfg_mssg msg = { .cmd = NFP_IPSEC_CFG_MSSG_INV_SA, .sa_idx = x->xso.offload_handle - 1, }; - struct net_device *netdev = x->xso.real_dev; struct nfp_net *nn; int err; - nn = netdev_priv(netdev); + nn = netdev_priv(dev); err = nfp_net_sched_mbox_amsg_work(nn, NFP_NET_CFG_MBOX_CMD_IPSEC, &msg, sizeof(msg), nfp_net_ipsec_cfg); if (err) diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c index d88bdb9a1717..47cdee5577d4 100644 --- a/drivers/net/netdevsim/ipsec.c +++ b/drivers/net/netdevsim/ipsec.c @@ -85,11 +85,11 @@ static int nsim_ipsec_find_empty_idx(struct nsim_ipsec *ipsec) return -ENOSPC; } -static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs, +static int nsim_ipsec_parse_proto_keys(struct net_device *dev, + struct xfrm_state *xs, u32 *mykey, u32 *mysalt) { const char aes_gcm_name[] = "rfc4106(gcm(aes))"; - struct net_device *dev = xs->xso.real_dev; unsigned char *key_data; char *alg_name = NULL; int key_len; @@ -129,17 +129,16 @@ static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs, return 0; } -static int nsim_ipsec_add_sa(struct xfrm_state *xs, +static int nsim_ipsec_add_sa(struct net_device *dev, + struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct nsim_ipsec *ipsec; - struct net_device *dev; struct netdevsim *ns; struct nsim_sa sa; u16 sa_idx; int ret; - dev = xs->xso.real_dev; ns = netdev_priv(dev); ipsec = &ns->ipsec; @@ -174,7 +173,7 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs, sa.crypt = xs->ealg || xs->aead; /* get the key and salt */ - ret = nsim_ipsec_parse_proto_keys(xs, sa.key, &sa.salt); + ret = nsim_ipsec_parse_proto_keys(dev, xs, sa.key, &sa.salt); if (ret) { NL_SET_ERR_MSG_MOD(extack, "Failed to get key data for SA table"); return ret; @@ -200,9 +199,9 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs, return 0; } -static void nsim_ipsec_del_sa(struct xfrm_state *xs) +static void nsim_ipsec_del_sa(struct net_device *dev, struct xfrm_state *xs) { - struct netdevsim *ns = netdev_priv(xs->xso.real_dev); + struct netdevsim *ns = netdev_priv(dev); struct nsim_ipsec *ipsec = &ns->ipsec; u16 sa_idx; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680c..88dfb8aeed3c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1013,9 +1013,13 @@ struct netdev_bpf { #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { - int (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack); - void (*xdo_dev_state_delete) (struct xfrm_state *x); - void (*xdo_dev_state_free) (struct xfrm_state *x); + int (*xdo_dev_state_add)(struct net_device *dev, + struct xfrm_state *x, + struct netlink_ext_ack *extack); + void (*xdo_dev_state_delete)(struct net_device *dev, + struct xfrm_state *x); + void (*xdo_dev_state_free)(struct net_device *dev, + struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 39365fd2ea17..3d2f6c879311 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,8 +147,16 @@ enum { }; struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver. + * Device drivers should not use it. + */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 0be5f7ffd019..3be0139373f7 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -309,7 +309,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); - xso->real_dev = dev; if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; @@ -321,11 +320,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; - err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); + err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack); if (err) { xso->dev = NULL; xso->dir = 0; - xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d67420e76389..3c2e27e5a1e3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -767,7 +767,7 @@ void xfrm_dev_state_delete(struct xfrm_state *x) struct net_device *dev = READ_ONCE(xso->dev); if (dev) { - dev->xfrmdev_ops->xdo_dev_state_delete(x); + dev->xfrmdev_ops->xdo_dev_state_delete(dev, x); spin_lock_bh(&xfrm_state_dev_gc_lock); hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); spin_unlock_bh(&xfrm_state_dev_gc_lock); @@ -789,7 +789,7 @@ void xfrm_dev_state_free(struct xfrm_state *x) spin_unlock_bh(&xfrm_state_dev_gc_lock); if (dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); + dev->xfrmdev_ops->xdo_dev_state_free(dev, x); WRITE_ONCE(xso->dev, NULL); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); @@ -1551,16 +1551,18 @@ found: if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = xdo->dev; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; - xso->dev = xdo->dev; + xso->dev = dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; - netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC); - error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); + netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC); + error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, + NULL); if (error) { xso->dir = 0; - netdev_put(xso->dev, &xso->dev_tracker); + netdev_put(dev, &xso->dev_tracker); xso->dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; -- cgit v1.2.3 From d2fddbd3479928e52061e1c8dd302006b6283ce8 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Fri, 11 Apr 2025 10:49:58 +0300 Subject: bonding: Fix multiple long standing offload races Refactor the bonding ipsec offload operations to fix a number of long-standing control plane races between state migration and user deletion and a few other issues. xfrm state deletion can happen concurrently with bond_change_active_slave() operation. This manifests itself as a bond_ipsec_del_sa() call with x->lock held, followed by a bond_ipsec_free_sa() a bit later from a wq. The alternate path of these calls coming from xfrm_dev_state_flush() can't happen, as that needs the RTNL lock and bond_change_active_slave() already holds it. 1. bond_ipsec_del_sa_all() might call xdo_dev_state_delete() a second time on an xfrm state that was concurrently killed. This is bad. 2. bond_ipsec_add_sa_all() can add a state on the new device, but pending bond_ipsec_free_sa() calls from the old device will then hit the WARN_ON() and then, worse, call xdo_dev_state_free() on the new device without a corresponding xdo_dev_state_delete(). 3. Resolve a sleeping in atomic context introduced by the mentioned "Fixes" commit. bond_ipsec_del_sa_all() and bond_ipsec_add_sa_all() now acquire x->lock and check for x->km.state to help with problems 1 and 2. And since xso.real_dev is now a private pointer managed by the bonding driver in xfrm state, make better use of it to fully fix problems 1 and 2. In bond_ipsec_del_sa_all(), set xso.real_dev to NULL while holding both the mutex and x->lock, which makes sure that neither bond_ipsec_del_sa() nor bond_ipsec_free_sa() could run concurrently. Fix problem 3 by moving the list cleanup (which requires the mutex) from bond_ipsec_del_sa() (called from atomic context) to bond_ipsec_free_sa() Finally, simplify bond_ipsec_del_sa() and bond_ipsec_free_sa() by using xso->real_dev directly, since it's now protected by locks and can be trusted to always reflect the offload device. Fixes: 2aeeef906d5a ("bonding: change ipsec_lock from spin lock to mutex") Signed-off-by: Cosmin Ratiu Reviewed-by: Leon Romanovsky Reviewed-by: Nikolay Aleksandrov Reviewed-by: Hangbin Liu Tested-by: Hangbin Liu Signed-off-by: Steffen Klassert --- drivers/net/bonding/bond_main.c | 82 ++++++++++++++++++----------------------- include/net/xfrm.h | 7 +++- 2 files changed, 41 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 14f7c9712ad4..8ed8c29659a0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -545,7 +545,20 @@ static void bond_ipsec_add_sa_all(struct bonding *bond) slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); continue; } + + spin_lock_bh(&ipsec->xs->lock); + /* xs might have been killed by the user during the migration + * to the new dev, but bond_ipsec_del_sa() should have done + * nothing, as xso.real_dev is NULL. + * Delete it from the device we just added it to. The pending + * bond_ipsec_free_sa() call will do the rest of the cleanup. + */ + if (ipsec->xs->km.state == XFRM_STATE_DEAD && + real_dev->xfrmdev_ops->xdo_dev_state_delete) + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, + ipsec->xs); ipsec->xs->xso.real_dev = real_dev; + spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); @@ -560,48 +573,20 @@ static void bond_ipsec_del_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; - netdevice_tracker tracker; - struct bond_ipsec *ipsec; - struct bonding *bond; - struct slave *slave; - if (!bond_dev) + if (!bond_dev || !xs->xso.real_dev) return; - rcu_read_lock(); - bond = netdev_priv(bond_dev); - slave = rcu_dereference(bond->curr_active_slave); - real_dev = slave ? slave->dev : NULL; - netdev_hold(real_dev, &tracker, GFP_ATOMIC); - rcu_read_unlock(); - - if (!slave) - goto out; - - if (!xs->xso.real_dev) - goto out; - - WARN_ON(xs->xso.real_dev != real_dev); + real_dev = xs->xso.real_dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); - goto out; + return; } real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); -out: - netdev_put(real_dev, &tracker); - mutex_lock(&bond->ipsec_lock); - list_for_each_entry(ipsec, &bond->ipsec_list, list) { - if (ipsec->xs == xs) { - list_del(&ipsec->list); - kfree(ipsec); - break; - } - } - mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_del_sa_all(struct bonding *bond) @@ -629,9 +614,15 @@ static void bond_ipsec_del_sa_all(struct bonding *bond) __func__); continue; } + + spin_lock_bh(&ipsec->xs->lock); ipsec->xs->xso.real_dev = NULL; - real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, - ipsec->xs); + /* Don't double delete states killed by the user. */ + if (ipsec->xs->km.state != XFRM_STATE_DEAD) + real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, + ipsec->xs); + spin_unlock_bh(&ipsec->xs->lock); + if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, ipsec->xs); @@ -643,34 +634,33 @@ static void bond_ipsec_free_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; - netdevice_tracker tracker; + struct bond_ipsec *ipsec; struct bonding *bond; - struct slave *slave; if (!bond_dev) return; - rcu_read_lock(); bond = netdev_priv(bond_dev); - slave = rcu_dereference(bond->curr_active_slave); - real_dev = slave ? slave->dev : NULL; - netdev_hold(real_dev, &tracker, GFP_ATOMIC); - rcu_read_unlock(); - - if (!slave) - goto out; + mutex_lock(&bond->ipsec_lock); if (!xs->xso.real_dev) goto out; - WARN_ON(xs->xso.real_dev != real_dev); + real_dev = xs->xso.real_dev; xs->xso.real_dev = NULL; - if (real_dev && real_dev->xfrmdev_ops && + if (real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); out: - netdev_put(real_dev, &tracker); + list_for_each_entry(ipsec, &bond->ipsec_list, list) { + if (ipsec->xs == xs) { + list_del(&ipsec->list); + kfree(ipsec); + break; + } + } + mutex_unlock(&bond->ipsec_lock); } /** diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 3d2f6c879311..b7e8f3f49627 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -154,8 +154,11 @@ struct xfrm_dev_offload { */ struct net_device *dev; netdevice_tracker dev_tracker; - /* This is a private pointer used by the bonding driver. - * Device drivers should not use it. + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. */ struct net_device *real_dev; unsigned long offload_handle; -- cgit v1.2.3 From cd1fafe7da1f6f2aa25723e317f6e8e9d0c050a1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 15 Apr 2025 05:24:58 +0000 Subject: eth: bnxt: add support rx side device memory TCP Currently, bnxt_en driver satisfies the requirements of the Device memory TCP, which is HDS. So, it implements rx-side Device memory TCP for bnxt_en driver. It requires only converting the page API to netmem API. `struct page` of agg rings are changed to `netmem_ref netmem` and corresponding functions are changed to a variant of netmem API. It also passes PP_FLAG_ALLOW_UNREADABLE_NETMEM flag to a parameter of page_pool. The netmem will be activated only when a user requests devmem TCP. When netmem is activated, received data is unreadable and netmem is disabled, received data is readable. But drivers don't need to handle both cases because netmem core API will handle it properly. So, using proper netmem API is enough for drivers. Device memory TCP can be tested with tools/testing/selftests/drivers/net/hw/ncdevmem. This is tested with BCM57504-N425G and firmware version 232.0.155.8/pkg 232.1.132.8. Reviewed-by: Mina Almasry Tested-by: David Wei Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250415052458.1260575-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 201 ++++++++++++++++++------------ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 +- include/net/page_pool/helpers.h | 11 ++ 3 files changed, 133 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8725e1e13908..83608b7447f4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -893,9 +893,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) bnapi->events &= ~BNXT_TX_CMP_EVENT; } -static bool bnxt_separate_head_pool(void) +static bool bnxt_separate_head_pool(struct bnxt_rx_ring_info *rxr) { - return PAGE_SIZE > BNXT_RX_PAGE_SIZE; + return rxr->need_head_pool || PAGE_SIZE > BNXT_RX_PAGE_SIZE; } static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, @@ -919,6 +919,20 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, return page; } +static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping, + struct bnxt_rx_ring_info *rxr, + gfp_t gfp) +{ + netmem_ref netmem; + + netmem = page_pool_alloc_netmems(rxr->page_pool, gfp); + if (!netmem) + return 0; + + *mapping = page_pool_get_dma_addr_netmem(netmem); + return netmem; +} + static inline u8 *__bnxt_alloc_rx_frag(struct bnxt *bp, dma_addr_t *mapping, struct bnxt_rx_ring_info *rxr, gfp_t gfp) @@ -999,21 +1013,19 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx) return next; } -static inline int bnxt_alloc_rx_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - u16 prod, gfp_t gfp) +static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, + u16 prod, gfp_t gfp) { struct rx_bd *rxbd = &rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)]; struct bnxt_sw_rx_agg_bd *rx_agg_buf; - struct page *page; - dma_addr_t mapping; u16 sw_prod = rxr->rx_sw_agg_prod; unsigned int offset = 0; + dma_addr_t mapping; + netmem_ref netmem; - page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp); - - if (!page) + netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp); + if (!netmem) return -ENOMEM; if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap))) @@ -1023,7 +1035,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp, rx_agg_buf = &rxr->rx_agg_ring[sw_prod]; rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod)); - rx_agg_buf->page = page; + rx_agg_buf->netmem = netmem; rx_agg_buf->offset = offset; rx_agg_buf->mapping = mapping; rxbd->rx_bd_haddr = cpu_to_le64(mapping); @@ -1067,11 +1079,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, p5_tpa = true; for (i = 0; i < agg_bufs; i++) { - u16 cons; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; + struct rx_agg_cmp *agg; struct rx_bd *prod_bd; - struct page *page; + netmem_ref netmem; + u16 cons; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); @@ -1088,11 +1100,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, cons_rx_buf = &rxr->rx_agg_ring[cons]; /* It is possible for sw_prod to be equal to cons, so - * set cons_rx_buf->page to NULL first. + * set cons_rx_buf->netmem to 0 first. */ - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; - prod_rx_buf->page = page; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; + prod_rx_buf->netmem = netmem; prod_rx_buf->offset = cons_rx_buf->offset; prod_rx_buf->mapping = cons_rx_buf->mapping; @@ -1218,29 +1230,35 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, return skb; } -static u32 __bnxt_rx_agg_pages(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct skb_shared_info *shinfo, - u16 idx, u32 agg_bufs, bool tpa, - struct xdp_buff *xdp) +static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + u16 idx, u32 agg_bufs, bool tpa, + struct sk_buff *skb, + struct xdp_buff *xdp) { struct bnxt_napi *bnapi = cpr->bnapi; - struct pci_dev *pdev = bp->pdev; - struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; - u16 prod = rxr->rx_agg_prod; + struct skb_shared_info *shinfo; + struct bnxt_rx_ring_info *rxr; u32 i, total_frag_len = 0; bool p5_tpa = false; + u16 prod; + + rxr = bnapi->rx_ring; + prod = rxr->rx_agg_prod; if ((bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && tpa) p5_tpa = true; + if (skb) + shinfo = skb_shinfo(skb); + else + shinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < agg_bufs; i++) { - skb_frag_t *frag = &shinfo->frags[i]; - u16 cons, frag_len; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf; - struct page *page; - dma_addr_t mapping; + struct rx_agg_cmp *agg; + u16 cons, frag_len; + netmem_ref netmem; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, i); @@ -1251,27 +1269,41 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; cons_rx_buf = &rxr->rx_agg_ring[cons]; - skb_frag_fill_page_desc(frag, cons_rx_buf->page, - cons_rx_buf->offset, frag_len); - shinfo->nr_frags = i + 1; + if (skb) { + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len, BNXT_RX_PAGE_SIZE); + } else { + skb_frag_t *frag = &shinfo->frags[i]; + + skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len); + shinfo->nr_frags = i + 1; + } __clear_bit(cons, rxr->rx_agg_bmap); - /* It is possible for bnxt_alloc_rx_page() to allocate + /* It is possible for bnxt_alloc_rx_netmem() to allocate * a sw_prod index that equals the cons index, so we * need to clear the cons entry now. */ - mapping = cons_rx_buf->mapping; - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; - if (xdp && page_is_pfmemalloc(page)) + if (xdp && netmem_is_pfmemalloc(netmem)) xdp_buff_set_frag_pfmemalloc(xdp); - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (skb) { + skb->len -= frag_len; + skb->data_len -= frag_len; + skb->truesize -= BNXT_RX_PAGE_SIZE; + } + --shinfo->nr_frags; - cons_rx_buf->page = page; + cons_rx_buf->netmem = netmem; - /* Update prod since possibly some pages have been + /* Update prod since possibly some netmems have been * allocated already. */ rxr->rx_agg_prod = prod; @@ -1279,8 +1311,8 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return 0; } - dma_sync_single_for_cpu(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE, - bp->rx_dir); + page_pool_dma_sync_netmem_for_cpu(rxr->page_pool, netmem, 0, + BNXT_RX_PAGE_SIZE); total_frag_len += frag_len; prod = NEXT_RX_AGG(prod); @@ -1289,32 +1321,28 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return total_frag_len; } -static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct sk_buff *skb, u16 idx, - u32 agg_bufs, bool tpa) +static struct sk_buff *bnxt_rx_agg_netmems_skb(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct sk_buff *skb, u16 idx, + u32 agg_bufs, bool tpa) { - struct skb_shared_info *shinfo = skb_shinfo(skb); u32 total_frag_len = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, - agg_bufs, tpa, NULL); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + skb, NULL); if (!total_frag_len) { skb_mark_for_recycle(skb); dev_kfree_skb(skb); return NULL; } - skb->data_len += total_frag_len; - skb->len += total_frag_len; - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; return skb; } -static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct xdp_buff *xdp, u16 idx, - u32 agg_bufs, bool tpa) +static u32 bnxt_rx_agg_netmems_xdp(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct xdp_buff *xdp, u16 idx, + u32 agg_bufs, bool tpa) { struct skb_shared_info *shinfo = xdp_get_shared_info_from_buff(xdp); u32 total_frag_len = 0; @@ -1322,8 +1350,8 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, if (!xdp_buff_has_frags(xdp)) shinfo->nr_frags = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, - idx, agg_bufs, tpa, xdp); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + NULL, xdp); if (total_frag_len) { xdp_buff_set_frags_flag(xdp); shinfo->nr_frags = agg_bufs; @@ -1895,7 +1923,8 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, } if (agg_bufs) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, idx, agg_bufs, true); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, idx, agg_bufs, + true); if (!skb) { /* Page reuse already handled by bnxt_rx_pages(). */ cpr->sw_stats->rx.rx_oom_discards += 1; @@ -2175,9 +2204,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (bnxt_xdp_attached(bp, rxr)) { bnxt_xdp_buff_init(bp, rxr, cons, data_ptr, len, &xdp); if (agg_bufs) { - u32 frag_len = bnxt_rx_agg_pages_xdp(bp, cpr, &xdp, - cp_cons, agg_bufs, - false); + u32 frag_len = bnxt_rx_agg_netmems_xdp(bp, cpr, &xdp, + cp_cons, + agg_bufs, + false); if (!frag_len) goto oom_next_rx; @@ -2229,7 +2259,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (agg_bufs) { if (!xdp_active) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, cp_cons, agg_bufs, false); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, cp_cons, + agg_bufs, false); if (!skb) goto oom_next_rx; } else { @@ -3445,15 +3476,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info for (i = 0; i < max_idx; i++) { struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; - struct page *page = rx_agg_buf->page; + netmem_ref netmem = rx_agg_buf->netmem; - if (!page) + if (!netmem) continue; - rx_agg_buf->page = NULL; + rx_agg_buf->netmem = 0; __clear_bit(i, rxr->rx_agg_bmap); - page_pool_recycle_direct(rxr->page_pool, page); + page_pool_recycle_direct_netmem(rxr->page_pool, netmem); } } @@ -3746,7 +3777,7 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = rxr->head_pool = NULL; @@ -3777,15 +3808,19 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; pp.max_len = PAGE_SIZE; - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | + PP_FLAG_ALLOW_UNREADABLE_NETMEM; + pp.queue_idx = rxr->bnapi->index; pool = page_pool_create(&pp); if (IS_ERR(pool)) return PTR_ERR(pool); rxr->page_pool = pool; - if (bnxt_separate_head_pool()) { + rxr->need_head_pool = page_pool_is_unreadable(pool); + if (bnxt_separate_head_pool(rxr)) { pp.pool_size = max(bp->rx_ring_size, 1024); + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pool = page_pool_create(&pp); if (IS_ERR(pool)) goto err_destroy_pp; @@ -4197,6 +4232,8 @@ static void bnxt_reset_rx_ring_struct(struct bnxt *bp, rxr->page_pool->p.napi = NULL; rxr->page_pool = NULL; + rxr->head_pool->p.napi = NULL; + rxr->head_pool = NULL; memset(&rxr->xdp_rxq, 0, sizeof(struct xdp_rxq_info)); ring = &rxr->rx_ring_struct; @@ -4321,16 +4358,16 @@ static void bnxt_alloc_one_rx_ring_skb(struct bnxt *bp, rxr->rx_prod = prod; } -static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - int ring_nr) +static void bnxt_alloc_one_rx_ring_netmem(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + int ring_nr) { u32 prod; int i; prod = rxr->rx_agg_prod; for (i = 0; i < bp->rx_agg_ring_size; i++) { - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", ring_nr, i, bp->rx_ring_size); break; @@ -4371,7 +4408,7 @@ static int bnxt_alloc_one_rx_ring(struct bnxt *bp, int ring_nr) if (!(bp->flags & BNXT_FLAG_AGG_RINGS)) return 0; - bnxt_alloc_one_rx_ring_page(bp, rxr, ring_nr); + bnxt_alloc_one_rx_ring_netmem(bp, rxr, ring_nr); if (rxr->rx_tpa) { rc = bnxt_alloc_one_tpa_info_data(bp, rxr); @@ -15708,6 +15745,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) clone->rx_agg_prod = 0; clone->rx_sw_agg_prod = 0; clone->rx_next_cons = 0; + clone->need_head_pool = false; rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); if (rc) @@ -15750,7 +15788,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) bnxt_alloc_one_rx_ring_skb(bp, clone, idx); if (bp->flags & BNXT_FLAG_AGG_RINGS) - bnxt_alloc_one_rx_ring_page(bp, clone, idx); + bnxt_alloc_one_rx_ring_netmem(bp, clone, idx); if (bp->flags & BNXT_FLAG_TPA) bnxt_alloc_one_tpa_info_data(bp, clone); @@ -15766,7 +15804,7 @@ err_rxq_info_unreg: xdp_rxq_info_unreg(&clone->xdp_rxq); err_page_pool_destroy: page_pool_destroy(clone->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(clone)) page_pool_destroy(clone->head_pool); clone->page_pool = NULL; clone->head_pool = NULL; @@ -15785,7 +15823,7 @@ static void bnxt_queue_mem_free(struct net_device *dev, void *qmem) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = NULL; rxr->head_pool = NULL; @@ -15876,6 +15914,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) rxr->page_pool = clone->page_pool; rxr->head_pool = clone->head_pool; rxr->xdp_rxq = clone->xdp_rxq; + rxr->need_head_pool = clone->need_head_pool; bnxt_copy_rx_ring(bp, rxr, clone); @@ -15961,7 +16000,7 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); page_pool_disable_direct_recycling(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_disable_direct_recycling(rxr->head_pool); if (bp->flags & BNXT_FLAG_SHARED_RINGS) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 21726cf56586..868a2e5a5b02 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -903,7 +903,7 @@ struct bnxt_sw_rx_bd { }; struct bnxt_sw_rx_agg_bd { - struct page *page; + netmem_ref netmem; unsigned int offset; dma_addr_t mapping; }; @@ -1106,6 +1106,7 @@ struct bnxt_rx_ring_info { unsigned long *rx_agg_bmap; u16 rx_agg_bmap_size; + bool need_head_pool; dma_addr_t rx_desc_mapping[MAX_RX_PAGES]; dma_addr_t rx_agg_desc_mapping[MAX_RX_AGG_PAGES]; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe2..93f2c31baf9b 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -395,6 +395,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -492,4 +498,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ -- cgit v1.2.3 From ab244a394c7f13f6573744b9ca72bb22151a3ec4 Mon Sep 17 00:00:00 2001 From: Chiachang Wang Date: Thu, 13 Mar 2025 02:36:40 +0000 Subject: xfrm: Migrate offload configuration Add hardware offload configuration to XFRM_MSG_MIGRATE using an option netlink attribute XFRMA_OFFLOAD_DEV. In the existing xfrm_state_migrate(), the xfrm_init_state() is called assuming no hardware offload by default. Even the original xfrm_state is configured with offload, the setting will be reset. If the device is configured with hardware offload, it's reasonable to allow the device to maintain its hardware offload mode. But the device will end up with offload disabled after receiving a migration event when the device migrates the connection from one netdev to another one. The devices that support migration may work with different underlying networks, such as mobile devices. The hardware setting should be forwarded to the different netdev based on the migration configuration. This change provides the capability for user space to migrate from one netdev to another. Test: Tested with kernel test in the Android tree located in https://android.googlesource.com/kernel/tests/ The xfrm_tunnel_test.py under the tests folder in particular. Signed-off-by: Chiachang Wang Reviewed-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 8 ++++++-- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- net/xfrm/xfrm_state.c | 9 ++++++++- net/xfrm/xfrm_user.c | 15 ++++++++++++--- 5 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b7e8f3f49627..466423a1a70a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,12 +1904,16 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index c56bb4f451e6..efc2a91f4c48 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2630,7 +2630,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL, 0, NULL); + kma ? &k : NULL, net, NULL, 0, NULL, NULL); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 143ac3aa7537..7200ba8de936 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4630,7 +4630,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4663,7 +4663,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; - xc = xfrm_state_migrate(x, mp, encap); + xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack); if (xc) { x_new[nx_new] = xc; nx_new++; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 3c2e27e5a1e3..1c5fe1b0b6d6 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2120,7 +2120,10 @@ EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack) { struct xfrm_state *xc; @@ -2136,6 +2139,10 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); + /* configure the hardware if offload is requested */ + if (xuo && xfrm_dev_state_add(net, xc, xuo, extack)) + goto error; + /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { /* a care is needed when the destination address of the diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0a3d3f3ae5a3..ae8e06573639 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3069,6 +3069,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + struct xfrm_user_offload *xuo = NULL; u32 if_id = 0; if (!attrs[XFRMA_MIGRATE]) { @@ -3099,11 +3100,19 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_OFFLOAD_DEV]) { + xuo = kmemdup(nla_data(attrs[XFRMA_OFFLOAD_DEV]), + sizeof(*xuo), GFP_KERNEL); + if (!xuo) { + err = -ENOMEM; + goto error; + } + } err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, - if_id, extack); - + if_id, extack, xuo); +error: kfree(encap); - + kfree(xuo); return err; } #else -- cgit v1.2.3 From b7a63391aa982295bbb3125e7d4470f51f31ff0f Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:19 +0200 Subject: ovpn: add basic netlink support This commit introduces basic netlink support with family registration/unregistration functionalities and stub pre/post-doit. More importantly it introduces the YAML uAPI description along with its auto-generated files: - include/uapi/linux/ovpn.h - drivers/net/ovpn/netlink-gen.c - drivers/net/ovpn/netlink-gen.h Reviewed-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-2-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ovpn.yaml | 367 ++++++++++++++++++++++++++++++++++ MAINTAINERS | 2 + drivers/net/ovpn/Makefile | 2 + drivers/net/ovpn/main.c | 31 +++ drivers/net/ovpn/main.h | 14 ++ drivers/net/ovpn/netlink-gen.c | 213 ++++++++++++++++++++ drivers/net/ovpn/netlink-gen.h | 41 ++++ drivers/net/ovpn/netlink.c | 160 +++++++++++++++ drivers/net/ovpn/netlink.h | 15 ++ drivers/net/ovpn/ovpnpriv.h | 21 ++ include/uapi/linux/ovpn.h | 109 ++++++++++ 11 files changed, 975 insertions(+) create mode 100644 Documentation/netlink/specs/ovpn.yaml create mode 100644 drivers/net/ovpn/main.h create mode 100644 drivers/net/ovpn/netlink-gen.c create mode 100644 drivers/net/ovpn/netlink-gen.h create mode 100644 drivers/net/ovpn/netlink.c create mode 100644 drivers/net/ovpn/netlink.h create mode 100644 drivers/net/ovpn/ovpnpriv.h create mode 100644 include/uapi/linux/ovpn.h (limited to 'include') diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml new file mode 100644 index 000000000000..096c51f0c69a --- /dev/null +++ b/Documentation/netlink/specs/ovpn.yaml @@ -0,0 +1,367 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Author: Antonio Quartulli +# +# Copyright (c) 2024-2025, OpenVPN Inc. +# + +name: ovpn + +protocol: genetlink + +doc: Netlink protocol to control OpenVPN network devices + +definitions: + - + type: const + name: nonce-tail-size + value: 8 + - + type: enum + name: cipher-alg + entries: [ none, aes-gcm, chacha20-poly1305 ] + - + type: enum + name: del-peer-reason + entries: + - teardown + - userspace + - expired + - transport-error + - transport-disconnect + - + type: enum + name: key-slot + entries: [ primary, secondary ] + +attribute-sets: + - + name: peer + attributes: + - + name: id + type: u32 + doc: >- + The unique ID of the peer in the device context. To be used to identify + peers during operations for a specific device + checks: + max: 0xFFFFFF + - + name: remote-ipv4 + type: u32 + doc: The remote IPv4 address of the peer + byte-order: big-endian + display-hint: ipv4 + - + name: remote-ipv6 + type: binary + doc: The remote IPv6 address of the peer + display-hint: ipv6 + checks: + exact-len: 16 + - + name: remote-ipv6-scope-id + type: u32 + doc: The scope id of the remote IPv6 address of the peer (RFC2553) + - + name: remote-port + type: u16 + doc: The remote port of the peer + byte-order: big-endian + checks: + min: 1 + - + name: socket + type: u32 + doc: The socket to be used to communicate with the peer + - + name: socket-netnsid + type: s32 + doc: The ID of the netns the socket assigned to this peer lives in + - + name: vpn-ipv4 + type: u32 + doc: The IPv4 address assigned to the peer by the server + byte-order: big-endian + display-hint: ipv4 + - + name: vpn-ipv6 + type: binary + doc: The IPv6 address assigned to the peer by the server + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-ipv4 + type: u32 + doc: The local IPv4 to be used to send packets to the peer (UDP only) + byte-order: big-endian + display-hint: ipv4 + - + name: local-ipv6 + type: binary + doc: The local IPv6 to be used to send packets to the peer (UDP only) + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-port + type: u16 + doc: The local port to be used to send packets to the peer (UDP only) + byte-order: big-endian + checks: + min: 1 + - + name: keepalive-interval + type: u32 + doc: >- + The number of seconds after which a keep alive message is sent to the + peer + - + name: keepalive-timeout + type: u32 + doc: >- + The number of seconds from the last activity after which the peer is + assumed dead + - + name: del-reason + type: u32 + doc: The reason why a peer was deleted + enum: del-peer-reason + - + name: vpn-rx-bytes + type: uint + doc: Number of bytes received over the tunnel + - + name: vpn-tx-bytes + type: uint + doc: Number of bytes transmitted over the tunnel + - + name: vpn-rx-packets + type: uint + doc: Number of packets received over the tunnel + - + name: vpn-tx-packets + type: uint + doc: Number of packets transmitted over the tunnel + - + name: link-rx-bytes + type: uint + doc: Number of bytes received at the transport level + - + name: link-tx-bytes + type: uint + doc: Number of bytes transmitted at the transport level + - + name: link-rx-packets + type: uint + doc: Number of packets received at the transport level + - + name: link-tx-packets + type: uint + doc: Number of packets transmitted at the transport level + - + name: keyconf + attributes: + - + name: peer-id + type: u32 + doc: >- + The unique ID of the peer in the device context. To be used to + identify peers during key operations + checks: + max: 0xFFFFFF + - + name: slot + type: u32 + doc: The slot where the key should be stored + enum: key-slot + - + name: key-id + doc: >- + The unique ID of the key in the peer context. Used to fetch the + correct key upon decryption + type: u32 + checks: + max: 7 + - + name: cipher-alg + type: u32 + doc: The cipher to be used when communicating with the peer + enum: cipher-alg + - + name: encrypt-dir + type: nest + doc: Key material for encrypt direction + nested-attributes: keydir + - + name: decrypt-dir + type: nest + doc: Key material for decrypt direction + nested-attributes: keydir + - + name: keydir + attributes: + - + name: cipher-key + type: binary + doc: The actual key to be used by the cipher + checks: + max-len: 256 + - + name: nonce-tail + type: binary + doc: >- + Random nonce to be concatenated to the packet ID, in order to + obtain the actual cipher IV + checks: + exact-len: nonce-tail-size + - + name: ovpn + attributes: + - + name: ifindex + type: u32 + doc: Index of the ovpn interface to operate on + - + name: peer + type: nest + doc: >- + The peer object containing the attributed of interest for the specific + operation + nested-attributes: peer + - + name: keyconf + type: nest + doc: Peer specific cipher configuration + nested-attributes: keyconf + +operations: + list: + - + name: peer-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-set + attribute-set: ovpn + flags: [ admin-perm ] + doc: modify a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve data about existing remote peers (or a specific one) + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + reply: + attributes: + - peer + dump: + request: + attributes: + - ifindex + reply: + attributes: + - peer + - + name: peer-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete existing remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-del-ntf + doc: Notification about a peer being deleted + notify: peer-get + mcgrp: peers + + - + name: key-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve non-sensitive data about peer key and cipher + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + reply: + attributes: + - keyconf + - + name: key-swap + attribute-set: ovpn + flags: [ admin-perm ] + doc: Swap primary and secondary session keys for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-swap-ntf + notify: key-get + doc: >- + Notification about key having exhausted its IV space and requiring + renegotiation + mcgrp: peers + - + name: key-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + +mcast-groups: + list: + - + name: peers diff --git a/MAINTAINERS b/MAINTAINERS index d0d77f43c5af..c50e87ef7288 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18131,7 +18131,9 @@ L: openvpn-devel@lists.sourceforge.net (subscribers-only) L: netdev@vger.kernel.org S: Supported T: git https://github.com/OpenVPN/linux-kernel-ovpn.git +F: Documentation/netlink/specs/ovpn.yaml F: drivers/net/ovpn/ +F: include/uapi/linux/ovpn.h OPENVSWITCH M: Aaron Conole diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 876800ebaa21..75ac62bba029 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -8,3 +8,5 @@ obj-$(CONFIG_OVPN) := ovpn.o ovpn-y += main.o +ovpn-y += netlink.o +ovpn-y += netlink-gen.o diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index d5d4ac8da6d7..719cac3db6ef 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -7,9 +7,29 @@ * James Yonan */ +#include #include #include #include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" + +static const struct net_device_ops ovpn_netdev_ops = { +}; + +/** + * ovpn_dev_is_valid - check if the netdevice is of type 'ovpn' + * @dev: the interface to check + * + * Return: whether the netdevice is of type 'ovpn' + */ +bool ovpn_dev_is_valid(const struct net_device *dev) +{ + return dev->netdev_ops == &ovpn_netdev_ops; +} static int ovpn_newlink(struct net_device *dev, struct rtnl_newlink_params *params, @@ -34,11 +54,22 @@ static int __init ovpn_init(void) return err; } + err = ovpn_nl_register(); + if (err) { + pr_err("ovpn: can't register netlink family: %d\n", err); + goto unreg_rtnl; + } + return 0; + +unreg_rtnl: + rtnl_link_unregister(&ovpn_link_ops); + return err; } static __exit void ovpn_cleanup(void) { + ovpn_nl_unregister(); rtnl_link_unregister(&ovpn_link_ops); rcu_barrier(); diff --git a/drivers/net/ovpn/main.h b/drivers/net/ovpn/main.h new file mode 100644 index 000000000000..017cd0100765 --- /dev/null +++ b/drivers/net/ovpn/main.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_MAIN_H_ +#define _NET_OVPN_MAIN_H_ + +bool ovpn_dev_is_valid(const struct net_device *dev); + +#endif /* _NET_OVPN_MAIN_H_ */ diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c new file mode 100644 index 000000000000..58e1a4342378 --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "netlink-gen.h" + +#include + +/* Integer value ranges */ +static const struct netlink_range_validation ovpn_a_peer_id_range = { + .max = 16777215ULL, +}; + +static const struct netlink_range_validation ovpn_a_keyconf_peer_id_range = { + .max = 16777215ULL, +}; + +/* Common nested types */ +const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1] = { + [OVPN_A_KEYCONF_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_keyconf_peer_id_range), + [OVPN_A_KEYCONF_SLOT] = NLA_POLICY_MAX(NLA_U32, 1), + [OVPN_A_KEYCONF_KEY_ID] = NLA_POLICY_MAX(NLA_U32, 7), + [OVPN_A_KEYCONF_CIPHER_ALG] = NLA_POLICY_MAX(NLA_U32, 2), + [OVPN_A_KEYCONF_ENCRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), + [OVPN_A_KEYCONF_DECRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), +}; + +const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1] = { + [OVPN_A_KEYDIR_CIPHER_KEY] = NLA_POLICY_MAX_LEN(256), + [OVPN_A_KEYDIR_NONCE_TAIL] = NLA_POLICY_EXACT_LEN(OVPN_NONCE_TAIL_SIZE), +}; + +const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { + [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), + [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID] = { .type = NLA_U32, }, + [OVPN_A_PEER_REMOTE_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_SOCKET] = { .type = NLA_U32, }, + [OVPN_A_PEER_SOCKET_NETNSID] = { .type = NLA_S32, }, + [OVPN_A_PEER_VPN_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_VPN_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, + [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, + [OVPN_A_PEER_DEL_REASON] = NLA_POLICY_MAX(NLA_U32, 4), + [OVPN_A_PEER_VPN_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_PACKETS] = { .type = NLA_UINT, }, +}; + +/* OVPN_CMD_PEER_NEW - do */ +static const struct nla_policy ovpn_peer_new_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_SET - do */ +static const struct nla_policy ovpn_peer_set_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - do */ +static const struct nla_policy ovpn_peer_get_do_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - dump */ +static const struct nla_policy ovpn_peer_get_dump_nl_policy[OVPN_A_IFINDEX + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, +}; + +/* OVPN_CMD_PEER_DEL - do */ +static const struct nla_policy ovpn_peer_del_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_KEY_NEW - do */ +static const struct nla_policy ovpn_key_new_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_GET - do */ +static const struct nla_policy ovpn_key_get_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_SWAP - do */ +static const struct nla_policy ovpn_key_swap_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_DEL - do */ +static const struct nla_policy ovpn_key_del_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* Ops table for ovpn */ +static const struct genl_split_ops ovpn_nl_ops[] = { + { + .cmd = OVPN_CMD_PEER_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_new_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_SET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_set_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_set_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_get_do_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .dumpit = ovpn_nl_peer_get_dumpit, + .policy = ovpn_peer_get_dump_nl_policy, + .maxattr = OVPN_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = OVPN_CMD_PEER_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_del_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_new_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_get_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_SWAP, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_swap_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_swap_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_del_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group ovpn_nl_mcgrps[] = { + [OVPN_NLGRP_PEERS] = { "peers", }, +}; + +struct genl_family ovpn_nl_family __ro_after_init = { + .name = OVPN_FAMILY_NAME, + .version = OVPN_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = ovpn_nl_ops, + .n_split_ops = ARRAY_SIZE(ovpn_nl_ops), + .mcgrps = ovpn_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ovpn_nl_mcgrps), +}; diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h new file mode 100644 index 000000000000..66a4e4a0a055 --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_OVPN_GEN_H +#define _LINUX_OVPN_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1]; +extern const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1]; +extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1]; + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + OVPN_NLGRP_PEERS, +}; + +extern struct genl_family ovpn_nl_family; + +#endif /* _LINUX_OVPN_GEN_H */ diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c new file mode 100644 index 000000000000..8d267d4c8228 --- /dev/null +++ b/drivers/net/ovpn/netlink.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include + +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" +#include "netlink-gen.h" + +MODULE_ALIAS_GENL_FAMILY(OVPN_FAMILY_NAME); + +/** + * ovpn_get_dev_from_attrs - retrieve the ovpn private data from the netdevice + * a netlink message is targeting + * @net: network namespace where to look for the interface + * @info: generic netlink info from the user request + * @tracker: tracker object to be used for the netdev reference acquisition + * + * Return: the ovpn private data, if found, or an error otherwise + */ +static struct ovpn_priv * +ovpn_get_dev_from_attrs(struct net *net, const struct genl_info *info, + netdevice_tracker *tracker) +{ + struct ovpn_priv *ovpn; + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_IFINDEX)) + return ERR_PTR(-EINVAL); + + ifindex = nla_get_u32(info->attrs[OVPN_A_IFINDEX]); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "ifindex does not match any interface"); + return ERR_PTR(-ENODEV); + } + + if (!ovpn_dev_is_valid(dev)) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "specified interface is not ovpn"); + NL_SET_BAD_ATTR(info->extack, info->attrs[OVPN_A_IFINDEX]); + return ERR_PTR(-EINVAL); + } + + ovpn = netdev_priv(dev); + netdev_hold(dev, tracker, GFP_ATOMIC); + rcu_read_unlock(); + + return ovpn; +} + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = ovpn_get_dev_from_attrs(genl_info_net(info), + info, tracker); + + if (IS_ERR(ovpn)) + return PTR_ERR(ovpn); + + info->user_ptr[0] = ovpn; + + return 0; +} + +void ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + + if (ovpn) + netdev_put(ovpn->dev, tracker); +} + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +/** + * ovpn_nl_register - perform any needed registration in the NL subsustem + * + * Return: 0 on success, a negative error code otherwise + */ +int __init ovpn_nl_register(void) +{ + int ret = genl_register_family(&ovpn_nl_family); + + if (ret) { + pr_err("ovpn: genl_register_family failed: %d\n", ret); + return ret; + } + + return 0; +} + +/** + * ovpn_nl_unregister - undo any module wide netlink registration + */ +void ovpn_nl_unregister(void) +{ + genl_unregister_family(&ovpn_nl_family); +} diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h new file mode 100644 index 000000000000..0d6c34e17082 --- /dev/null +++ b/drivers/net/ovpn/netlink.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_NETLINK_H_ +#define _NET_OVPN_NETLINK_H_ + +int ovpn_nl_register(void); +void ovpn_nl_unregister(void); + +#endif /* _NET_OVPN_NETLINK_H_ */ diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h new file mode 100644 index 000000000000..f9322536b06d --- /dev/null +++ b/drivers/net/ovpn/ovpnpriv.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNSTRUCT_H_ +#define _NET_OVPN_OVPNSTRUCT_H_ + +/** + * struct ovpn_priv - per ovpn interface state + * @dev: the actual netdev representing the tunnel + */ +struct ovpn_priv { + struct net_device *dev; +}; + +#endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h new file mode 100644 index 000000000000..680d1522dc87 --- /dev/null +++ b/include/uapi/linux/ovpn.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_OVPN_H +#define _UAPI_LINUX_OVPN_H + +#define OVPN_FAMILY_NAME "ovpn" +#define OVPN_FAMILY_VERSION 1 + +#define OVPN_NONCE_TAIL_SIZE 8 + +enum ovpn_cipher_alg { + OVPN_CIPHER_ALG_NONE, + OVPN_CIPHER_ALG_AES_GCM, + OVPN_CIPHER_ALG_CHACHA20_POLY1305, +}; + +enum ovpn_del_peer_reason { + OVPN_DEL_PEER_REASON_TEARDOWN, + OVPN_DEL_PEER_REASON_USERSPACE, + OVPN_DEL_PEER_REASON_EXPIRED, + OVPN_DEL_PEER_REASON_TRANSPORT_ERROR, + OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT, +}; + +enum ovpn_key_slot { + OVPN_KEY_SLOT_PRIMARY, + OVPN_KEY_SLOT_SECONDARY, +}; + +enum { + OVPN_A_PEER_ID = 1, + OVPN_A_PEER_REMOTE_IPV4, + OVPN_A_PEER_REMOTE_IPV6, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + OVPN_A_PEER_REMOTE_PORT, + OVPN_A_PEER_SOCKET, + OVPN_A_PEER_SOCKET_NETNSID, + OVPN_A_PEER_VPN_IPV4, + OVPN_A_PEER_VPN_IPV6, + OVPN_A_PEER_LOCAL_IPV4, + OVPN_A_PEER_LOCAL_IPV6, + OVPN_A_PEER_LOCAL_PORT, + OVPN_A_PEER_KEEPALIVE_INTERVAL, + OVPN_A_PEER_KEEPALIVE_TIMEOUT, + OVPN_A_PEER_DEL_REASON, + OVPN_A_PEER_VPN_RX_BYTES, + OVPN_A_PEER_VPN_TX_BYTES, + OVPN_A_PEER_VPN_RX_PACKETS, + OVPN_A_PEER_VPN_TX_PACKETS, + OVPN_A_PEER_LINK_RX_BYTES, + OVPN_A_PEER_LINK_TX_BYTES, + OVPN_A_PEER_LINK_RX_PACKETS, + OVPN_A_PEER_LINK_TX_PACKETS, + + __OVPN_A_PEER_MAX, + OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) +}; + +enum { + OVPN_A_KEYCONF_PEER_ID = 1, + OVPN_A_KEYCONF_SLOT, + OVPN_A_KEYCONF_KEY_ID, + OVPN_A_KEYCONF_CIPHER_ALG, + OVPN_A_KEYCONF_ENCRYPT_DIR, + OVPN_A_KEYCONF_DECRYPT_DIR, + + __OVPN_A_KEYCONF_MAX, + OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1) +}; + +enum { + OVPN_A_KEYDIR_CIPHER_KEY = 1, + OVPN_A_KEYDIR_NONCE_TAIL, + + __OVPN_A_KEYDIR_MAX, + OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1) +}; + +enum { + OVPN_A_IFINDEX = 1, + OVPN_A_PEER, + OVPN_A_KEYCONF, + + __OVPN_A_MAX, + OVPN_A_MAX = (__OVPN_A_MAX - 1) +}; + +enum { + OVPN_CMD_PEER_NEW = 1, + OVPN_CMD_PEER_SET, + OVPN_CMD_PEER_GET, + OVPN_CMD_PEER_DEL, + OVPN_CMD_PEER_DEL_NTF, + OVPN_CMD_KEY_NEW, + OVPN_CMD_KEY_GET, + OVPN_CMD_KEY_SWAP, + OVPN_CMD_KEY_SWAP_NTF, + OVPN_CMD_KEY_DEL, + + __OVPN_CMD_MAX, + OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) +}; + +#define OVPN_MCGRP_PEERS "peers" + +#endif /* _UAPI_LINUX_OVPN_H */ -- cgit v1.2.3 From c2d950c4672a012ea9765c15a389cdcdf919f652 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:20 +0200 Subject: ovpn: add basic interface creation/destruction/management routines Add basic infrastructure for handling ovpn interfaces. Tested-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-3-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/rt-link.yaml | 16 +++++++ drivers/net/ovpn/Makefile | 1 + drivers/net/ovpn/io.c | 22 +++++++++ drivers/net/ovpn/io.h | 24 ++++++++++ drivers/net/ovpn/main.c | 82 +++++++++++++++++++++++++++++++- drivers/net/ovpn/ovpnpriv.h | 5 ++ drivers/net/ovpn/proto.h | 38 +++++++++++++++ include/uapi/linux/if_link.h | 15 ++++++ 8 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ovpn/io.c create mode 100644 drivers/net/ovpn/io.h create mode 100644 drivers/net/ovpn/proto.h (limited to 'include') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 31238455f8e9..a50d9d7d882e 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -938,6 +938,12 @@ definitions: entries: - name: none - name: default + - + name: ovpn-mode + type: enum + entries: + - p2p + - mp attribute-sets: - @@ -2272,6 +2278,13 @@ attribute-sets: - name: tailroom type: u16 + - + name: linkinfo-ovpn-attrs + attributes: + - + name: mode + type: u8 + enum: ovpn-mode sub-messages: - @@ -2322,6 +2335,9 @@ sub-messages: - value: netkit attribute-set: linkinfo-netkit-attrs + - + value: ovpn + attribute-set: linkinfo-ovpn-attrs - name: linkinfo-member-data-msg formats: diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 75ac62bba029..0e5f686672fb 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -8,5 +8,6 @@ obj-$(CONFIG_OVPN) := ovpn.o ovpn-y += main.o +ovpn-y += io.o ovpn-y += netlink.o ovpn-y += netlink-gen.o diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c new file mode 100644 index 000000000000..4b71c38165d7 --- /dev/null +++ b/drivers/net/ovpn/io.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include + +#include "io.h" + +/* Send user data to the network + */ +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + skb_tx_error(skb); + kfree_skb(skb); + return NET_XMIT_DROP; +} diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h new file mode 100644 index 000000000000..afea5f81f562 --- /dev/null +++ b/drivers/net/ovpn/io.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPN_H_ +#define _NET_OVPN_OVPN_H_ + +/* DATA_V2 header size with AEAD encryption */ +#define OVPN_HEAD_ROOM (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE + \ + 16 /* AEAD TAG length */ + \ + max(sizeof(struct udphdr), sizeof(struct tcphdr)) +\ + max(sizeof(struct ipv6hdr), sizeof(struct iphdr))) + +/* max padding required by encryption */ +#define OVPN_MAX_PADDING 16 + +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); + +#endif /* _NET_OVPN_OVPN_H_ */ diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 719cac3db6ef..ea7dad374c00 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -10,14 +10,28 @@ #include #include #include +#include +#include #include -#include +#include #include "ovpnpriv.h" #include "main.h" #include "netlink.h" +#include "io.h" +#include "proto.h" static const struct net_device_ops ovpn_netdev_ops = { + .ndo_start_xmit = ovpn_net_xmit, +}; + +static const struct device_type ovpn_type = { + .name = OVPN_FAMILY_NAME, +}; + +static const struct nla_policy ovpn_policy[IFLA_OVPN_MAX + 1] = { + [IFLA_OVPN_MODE] = NLA_POLICY_RANGE(NLA_U8, OVPN_MODE_P2P, + OVPN_MODE_MP), }; /** @@ -31,18 +45,82 @@ bool ovpn_dev_is_valid(const struct net_device *dev) return dev->netdev_ops == &ovpn_netdev_ops; } +static void ovpn_setup(struct net_device *dev) +{ + netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO | + NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA; + + dev->needs_free_netdev = true; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; + + dev->netdev_ops = &ovpn_netdev_ops; + + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->mtu = ETH_DATA_LEN - OVPN_HEAD_ROOM; + dev->min_mtu = IPV4_MIN_MTU; + dev->max_mtu = IP_MAX_MTU - OVPN_HEAD_ROOM; + + dev->type = ARPHRD_NONE; + dev->flags = IFF_POINTOPOINT | IFF_NOARP; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->lltx = true; + dev->features |= feat; + dev->hw_features |= feat; + dev->hw_enc_features |= feat; + + dev->needed_headroom = ALIGN(OVPN_HEAD_ROOM, 4); + dev->needed_tailroom = OVPN_MAX_PADDING; + + SET_NETDEV_DEVTYPE(dev, &ovpn_type); +} + static int ovpn_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - return -EOPNOTSUPP; + struct ovpn_priv *ovpn = netdev_priv(dev); + struct nlattr **data = params->data; + enum ovpn_mode mode = OVPN_MODE_P2P; + + if (data && data[IFLA_OVPN_MODE]) { + mode = nla_get_u8(data[IFLA_OVPN_MODE]); + netdev_dbg(dev, "setting device mode: %u\n", mode); + } + + ovpn->dev = dev; + ovpn->mode = mode; + + /* turn carrier explicitly off after registration, this way state is + * clearly defined + */ + netif_carrier_off(dev); + + return register_netdevice(dev); +} + +static int ovpn_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + if (nla_put_u8(skb, IFLA_OVPN_MODE, ovpn->mode)) + return -EMSGSIZE; + + return 0; } static struct rtnl_link_ops ovpn_link_ops = { .kind = "ovpn", .netns_refund = false, + .priv_size = sizeof(struct ovpn_priv), + .setup = ovpn_setup, + .policy = ovpn_policy, + .maxtype = IFLA_OVPN_MAX, .newlink = ovpn_newlink, .dellink = unregister_netdevice_queue, + .fill_info = ovpn_fill_info, }; static int __init ovpn_init(void) diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index f9322536b06d..b6b644668d46 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -10,12 +10,17 @@ #ifndef _NET_OVPN_OVPNSTRUCT_H_ #define _NET_OVPN_OVPNSTRUCT_H_ +#include +#include + /** * struct ovpn_priv - per ovpn interface state * @dev: the actual netdev representing the tunnel + * @mode: device operation mode (i.e. p2p, mp, ..) */ struct ovpn_priv { struct net_device *dev; + enum ovpn_mode mode; }; #endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h new file mode 100644 index 000000000000..5f95a78bebd3 --- /dev/null +++ b/drivers/net/ovpn/proto.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_PROTO_H_ +#define _NET_OVPN_PROTO_H_ + +/* When the OpenVPN protocol is ran in AEAD mode, use + * the OpenVPN packet ID as the AEAD nonce: + * + * 00000005 521c3b01 4308c041 + * [seq # ] [ nonce_tail ] + * [ 12-byte full IV ] -> OVPN_NONCE_SIZE + * [4-bytes -> OVPN_NONCE_WIRE_SIZE + * on wire] + */ + +/* nonce size (96bits) as required by AEAD ciphers */ +#define OVPN_NONCE_SIZE 12 +/* last 8 bytes of AEAD nonce: provided by userspace and usually derived + * from key material generated during TLS handshake + */ +#define OVPN_NONCE_TAIL_SIZE 8 + +/* OpenVPN nonce size reduced by 8-byte nonce tail -- this is the + * size of the AEAD Associated Data (AD) sent over the wire + * and is normally the head of the IV + */ +#define OVPN_NONCE_WIRE_SIZE (OVPN_NONCE_SIZE - OVPN_NONCE_TAIL_SIZE) + +#define OVPN_OPCODE_SIZE 4 /* DATA_V2 opcode size */ + +#endif /* _NET_OVPN_PROTO_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 318386cc5b0d..3ad2d5d98034 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1986,4 +1986,19 @@ enum { #define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) +/* OVPN section */ + +enum ovpn_mode { + OVPN_MODE_P2P, + OVPN_MODE_MP, +}; + +enum { + IFLA_OVPN_UNSPEC, + IFLA_OVPN_MODE, + __IFLA_OVPN_MAX, +}; + +#define IFLA_OVPN_MAX (__IFLA_OVPN_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3 From f6226ae7a0cd47aaa9175aca6a1e19600f884cbf Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:23 +0200 Subject: ovpn: introduce the ovpn_socket object This specific structure is used in the ovpn kernel module to wrap and carry around a standard kernel socket. ovpn takes ownership of passed sockets and therefore an ovpn specific objects is attached to them for status tracking purposes. Initially only UDP support is introduced. TCP will come in a later patch. Cc: willemdebruijn.kernel@gmail.com Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-6-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/Makefile | 2 + drivers/net/ovpn/main.c | 3 +- drivers/net/ovpn/peer.c | 28 +++++-- drivers/net/ovpn/peer.h | 6 +- drivers/net/ovpn/socket.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/net/ovpn/socket.h | 38 +++++++++ drivers/net/ovpn/udp.c | 75 ++++++++++++++++++ drivers/net/ovpn/udp.h | 19 +++++ include/uapi/linux/udp.h | 1 + 9 files changed, 362 insertions(+), 7 deletions(-) create mode 100644 drivers/net/ovpn/socket.c create mode 100644 drivers/net/ovpn/socket.h create mode 100644 drivers/net/ovpn/udp.c create mode 100644 drivers/net/ovpn/udp.h (limited to 'include') diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 618328ae3388..164f2058ea8e 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -13,3 +13,5 @@ ovpn-y += io.o ovpn-y += netlink.o ovpn-y += netlink-gen.o ovpn-y += peer.o +ovpn-y += socket.o +ovpn-y += udp.o diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 3c2b6a7df272..e9a6dc100d46 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -116,7 +116,8 @@ static void ovpn_dellink(struct net_device *dev, struct list_head *head) struct ovpn_priv *ovpn = netdev_priv(dev); if (ovpn->mode == OVPN_MODE_P2P) - ovpn_peer_release_p2p(ovpn, OVPN_DEL_PEER_REASON_TEARDOWN); + ovpn_peer_release_p2p(ovpn, NULL, + OVPN_DEL_PEER_REASON_TEARDOWN); unregister_netdevice_queue(dev, head); } diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 338069c99248..0bb6c1517184 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -16,17 +16,20 @@ #include "main.h" #include "netlink.h" #include "peer.h" +#include "socket.h" static void unlock_ovpn(struct ovpn_priv *ovpn, - struct llist_head *release_list) + struct llist_head *release_list) __releases(&ovpn->lock) { struct ovpn_peer *peer; spin_unlock_bh(&ovpn->lock); - llist_for_each_entry(peer, release_list->first, release_entry) + llist_for_each_entry(peer, release_list->first, release_entry) { + ovpn_socket_release(peer); ovpn_peer_put(peer); + } } /** @@ -394,18 +397,33 @@ int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) /** * ovpn_peer_release_p2p - release peer upon P2P device teardown * @ovpn: the instance being torn down + * @sk: if not NULL, release peer only if it's using this specific socket * @reason: the reason for releasing the peer */ -void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, +void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, enum ovpn_del_peer_reason reason) { + struct ovpn_socket *ovpn_sock; LLIST_HEAD(release_list); struct ovpn_peer *peer; spin_lock_bh(&ovpn->lock); peer = rcu_dereference_protected(ovpn->peer, lockdep_is_held(&ovpn->lock)); - if (peer) - ovpn_peer_remove(peer, reason, &release_list); + if (!peer) { + spin_unlock_bh(&ovpn->lock); + return; + } + + if (sk) { + ovpn_sock = rcu_access_pointer(peer->sock); + if (!ovpn_sock || ovpn_sock->sock->sk != sk) { + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + return; + } + } + + ovpn_peer_remove(peer, reason, &release_list); unlock_ovpn(ovpn, &release_list); } diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index fd2e7625990a..29c9065cedcc 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -12,6 +12,8 @@ #include +#include "socket.h" + /** * struct ovpn_peer - the main remote peer object * @ovpn: main openvpn instance this peer belongs to @@ -20,6 +22,7 @@ * @vpn_addrs: IP addresses assigned over the tunnel * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel + * @sock: the socket being used to talk to this peer * @dst_cache: cache for dst_entry used to send to peer * @bind: remote peer binding * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..) @@ -36,6 +39,7 @@ struct ovpn_peer { struct in_addr ipv4; struct in6_addr ipv6; } vpn_addrs; + struct ovpn_socket __rcu *sock; struct dst_cache dst_cache; struct ovpn_bind __rcu *bind; enum ovpn_del_peer_reason delete_reason; @@ -70,7 +74,7 @@ static inline void ovpn_peer_put(struct ovpn_peer *peer) struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id); int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer); int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason); -void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, +void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, enum ovpn_del_peer_reason reason); struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c new file mode 100644 index 000000000000..beec7aee35e1 --- /dev/null +++ b/drivers/net/ovpn/socket.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "peer.h" +#include "socket.h" +#include "udp.h" + +static void ovpn_socket_release_kref(struct kref *kref) +{ + struct ovpn_socket *sock = container_of(kref, struct ovpn_socket, + refcount); + + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) + ovpn_udp_socket_detach(sock); + + kfree_rcu(sock, rcu); +} + +/** + * ovpn_socket_put - decrease reference counter + * @peer: peer whose socket reference counter should be decreased + * @sock: the RCU protected peer socket + * + * This function is only used internally. Users willing to release + * references to the ovpn_socket should use ovpn_socket_release() + */ +static void ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) +{ + kref_put(&sock->refcount, ovpn_socket_release_kref); +} + +/** + * ovpn_socket_release - release resources owned by socket user + * @peer: peer whose socket should be released + * + * This function should be invoked when the user is shutting + * down and wants to drop its link to the socket. + * + * In case of UDP, the detach routine will drop a reference to the + * ovpn netdev, pointed by the ovpn_socket. + * + * In case of TCP, releasing the socket will cause dropping + * the refcounter for the peer it is linked to, thus allowing the peer + * disappear as well. + * + * This function is expected to be invoked exactly once per peer + * + * NOTE: this function may sleep + */ +void ovpn_socket_release(struct ovpn_peer *peer) +{ + struct ovpn_socket *sock; + + might_sleep(); + + sock = rcu_replace_pointer(peer->sock, NULL, true); + /* release may be invoked after socket was detached */ + if (!sock) + return; + + /* sanity check: we should not end up here if the socket + * was already closed + */ + if (!sock->sock->sk) { + DEBUG_NET_WARN_ON_ONCE(1); + return; + } + + /* Drop the reference while holding the sock lock to avoid + * concurrent ovpn_socket_new call to mess up with a partially + * detached socket. + * + * Holding the lock ensures that a socket with refcnt 0 is fully + * detached before it can be picked by a concurrent reader. + */ + lock_sock(sock->sock->sk); + ovpn_socket_put(peer, sock); + release_sock(sock->sock->sk); + + /* align all readers with sk_user_data being NULL */ + synchronize_rcu(); +} + +static bool ovpn_socket_hold(struct ovpn_socket *sock) +{ + return kref_get_unless_zero(&sock->refcount); +} + +static int ovpn_socket_attach(struct ovpn_socket *sock, struct ovpn_peer *peer) +{ + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) + return ovpn_udp_socket_attach(sock, peer->ovpn); + + return -EOPNOTSUPP; +} + +/** + * ovpn_socket_new - create a new socket and initialize it + * @sock: the kernel socket to embed + * @peer: the peer reachable via this socket + * + * Return: an openvpn socket on success or a negative error code otherwise + */ +struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) +{ + struct ovpn_socket *ovpn_sock; + int ret; + + lock_sock(sock->sk); + + /* a TCP socket can only be owned by a single peer, therefore there + * can't be any other user + */ + if (sock->sk->sk_protocol == IPPROTO_TCP && sock->sk->sk_user_data) { + ovpn_sock = ERR_PTR(-EBUSY); + goto sock_release; + } + + /* a UDP socket can be shared across multiple peers, but we must make + * sure it is not owned by something else + */ + if (sock->sk->sk_protocol == IPPROTO_UDP) { + u8 type = READ_ONCE(udp_sk(sock->sk)->encap_type); + + /* socket owned by other encapsulation module */ + if (type && type != UDP_ENCAP_OVPNINUDP) { + ovpn_sock = ERR_PTR(-EBUSY); + goto sock_release; + } + + rcu_read_lock(); + ovpn_sock = rcu_dereference_sk_user_data(sock->sk); + if (ovpn_sock) { + /* socket owned by another ovpn instance, we can't use it */ + if (ovpn_sock->ovpn != peer->ovpn) { + ovpn_sock = ERR_PTR(-EBUSY); + rcu_read_unlock(); + goto sock_release; + } + + /* this socket is already owned by this instance, + * therefore we can increase the refcounter and + * use it as expected + */ + if (WARN_ON(!ovpn_socket_hold(ovpn_sock))) { + /* this should never happen because setting + * the refcnt to 0 and detaching the socket + * is expected to be atomic + */ + ovpn_sock = ERR_PTR(-EAGAIN); + rcu_read_unlock(); + goto sock_release; + } + + rcu_read_unlock(); + goto sock_release; + } + rcu_read_unlock(); + } + + /* socket is not owned: attach to this ovpn instance */ + + ovpn_sock = kzalloc(sizeof(*ovpn_sock), GFP_KERNEL); + if (!ovpn_sock) { + ovpn_sock = ERR_PTR(-ENOMEM); + goto sock_release; + } + + ovpn_sock->ovpn = peer->ovpn; + ovpn_sock->sock = sock; + kref_init(&ovpn_sock->refcount); + + ret = ovpn_socket_attach(ovpn_sock, peer); + if (ret < 0) { + kfree(ovpn_sock); + ovpn_sock = ERR_PTR(ret); + goto sock_release; + } + + rcu_assign_sk_user_data(sock->sk, ovpn_sock); +sock_release: + release_sock(sock->sk); + return ovpn_sock; +} diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h new file mode 100644 index 000000000000..ade8c94619d7 --- /dev/null +++ b/drivers/net/ovpn/socket.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_SOCK_H_ +#define _NET_OVPN_SOCK_H_ + +#include +#include +#include + +struct ovpn_priv; +struct ovpn_peer; + +/** + * struct ovpn_socket - a kernel socket referenced in the ovpn code + * @ovpn: ovpn instance owning this socket (UDP only) + * @sock: the low level sock object + * @refcount: amount of contexts currently referencing this object + * @rcu: member used to schedule RCU destructor callback + */ +struct ovpn_socket { + struct ovpn_priv *ovpn; + struct socket *sock; + struct kref refcount; + struct rcu_head rcu; +}; + +struct ovpn_socket *ovpn_socket_new(struct socket *sock, + struct ovpn_peer *peer); +void ovpn_socket_release(struct ovpn_peer *peer); + +#endif /* _NET_OVPN_SOCK_H_ */ diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c new file mode 100644 index 000000000000..91970e66a434 --- /dev/null +++ b/drivers/net/ovpn/udp.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "socket.h" +#include "udp.h" + +/** + * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn + * @ovpn_sock: socket to configure + * @ovpn: the openvp instance to link + * + * After invoking this function, the sock will be controlled by ovpn so that + * any incoming packet may be processed by ovpn first. + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_priv *ovpn) +{ + struct socket *sock = ovpn_sock->sock; + struct ovpn_socket *old_data; + int ret = 0; + + /* make sure no pre-existing encapsulation handler exists */ + rcu_read_lock(); + old_data = rcu_dereference_sk_user_data(sock->sk); + if (!old_data) { + /* socket is currently unused - we can take it */ + rcu_read_unlock(); + return 0; + } + + /* socket is in use. We need to understand if it's owned by this ovpn + * instance or by something else. + * In the former case, we can increase the refcounter and happily + * use it, because the same UDP socket is expected to be shared among + * different peers. + * + * Unlikely TCP, a single UDP socket can be used to talk to many remote + * hosts and therefore openvpn instantiates one only for all its peers + */ + if ((READ_ONCE(udp_sk(sock->sk)->encap_type) == UDP_ENCAP_OVPNINUDP) && + old_data->ovpn == ovpn) { + netdev_dbg(ovpn->dev, + "provided socket already owned by this interface\n"); + ret = -EALREADY; + } else { + netdev_dbg(ovpn->dev, + "provided socket already taken by other user\n"); + ret = -EBUSY; + } + rcu_read_unlock(); + + return ret; +} + +/** + * ovpn_udp_socket_detach - clean udp-tunnel status for this socket + * @ovpn_sock: the socket to clean + */ +void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock) +{ +} diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h new file mode 100644 index 000000000000..1c8fb6fe402d --- /dev/null +++ b/drivers/net/ovpn/udp.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_UDP_H_ +#define _NET_OVPN_UDP_H_ + +struct ovpn_priv; +struct socket; + +int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_priv *ovpn); +void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock); + +#endif /* _NET_OVPN_UDP_H_ */ diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index d85d671deed3..edca3e430305 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -43,5 +43,6 @@ struct udphdr { #define UDP_ENCAP_GTP1U 5 /* 3GPP TS 29.060 */ #define UDP_ENCAP_RXRPC 6 #define TCP_ENCAP_ESPINTCP 7 /* Yikes, this is really xfrm encap types. */ +#define UDP_ENCAP_OVPNINUDP 8 /* OpenVPN traffic */ #endif /* _UAPI_LINUX_UDP_H */ -- cgit v1.2.3 From 17240749f26e07cafa676688d8a3326086498447 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:29 +0200 Subject: skb: implement skb_send_sock_locked_with_flags() When sending an skb over a socket using skb_send_sock_locked(), it is currently not possible to specify any flag to be set in msghdr->msg_flags. However, we may want to pass flags the user may have specified, like MSG_NOSIGNAL. Extend __skb_send_sock() with a new argument 'flags' and add a new interface named skb_send_sock_locked_with_flags(). Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-12-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f1381aff0f89..beb084ee4f4d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4145,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags); int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74a2d886a35b..d73ad79fe739 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3227,7 +3227,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg) + int len, sendmsg_func sendmsg, int flags) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3245,7 +3245,7 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3282,7 +3282,8 @@ do_frag_list: while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, @@ -3328,14 +3329,21 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** -- cgit v1.2.3 From a1b669ea16c4d7c1a1a8fc7e25aaf651ea0078c3 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:57 -0700 Subject: bpf: Prepare to reuse get_ctx_arg_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename get_ctx_arg_idx to bpf_ctx_arg_idx, and allow others to call it. No functional change. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-2-ameryhung@gmail.com --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index ebc0c0c9b944..b2983706292f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfa..ac6c8ed08de3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6391,8 +6391,8 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6671,7 +6671,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. -- cgit v1.2.3 From 151e13ece86d234213b7f224f0e26a957c0eeb3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 18:02:15 -0700 Subject: net: ethtool: Adjust exactly ETH_GSTRING_LEN-long stats to use memcpy Many drivers populate the stats buffer using C-String based APIs (e.g. ethtool_sprintf() and ethtool_puts()), usually when building up the list of stats individually (i.e. with a for() loop). This, however, requires that the source strings be populated in such a way as to have a terminating NUL byte in the source. Other drivers populate the stats buffer directly using one big memcpy() of an entire array of strings. No NUL termination is needed here, as the bytes are being directly passed through. Yet others will build up the stats buffer individually, but also use memcpy(). This, too, does not need NUL termination of the source strings. However, there are cases where the strings that populate the source stats strings are exactly ETH_GSTRING_LEN long, and GCC 15's -Wunterminated-string-initialization option complains that the trailing NUL byte has been truncated. This situation is fine only if the driver is using the memcpy() approach. If the C-String APIs are used, the destination string name will have its final byte truncated by the required trailing NUL byte applied by the C-string API. For drivers that are already using memcpy() but have initializers that truncate the NUL terminator, mark their source strings as __nonstring to silence the GCC warnings. For drivers that have initializers that truncate the NUL terminator and are using the C-String APIs, switch to memcpy() to avoid destination string truncation and mark their source strings as __nonstring to silence the GCC warnings. (Also introduce ethtool_cpy() as a helper to make this an easy replacement). Specifically the following warnings were investigated and addressed: ../drivers/net/ethernet/chelsio/cxgb/cxgb2.c:364:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 364 | "TxFramesAbortedDueToXSCollisions", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:165:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 165 | { ENETC_PM_R1523X(0), "MAC rx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:190:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 190 | { ENETC_PM_T1523X(0), "MAC tx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/google/gve/gve_ethtool.c:76:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 76 | "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:117:53: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 117 | STMMAC_STAT(ptp_rx_msg_type_pdelay_follow_up), | ^ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:46:12: note: in definition of macro 'STMMAC_STAT' 46 | { #m, sizeof_field(struct stmmac_extra_stats, m), \ | ^ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:328:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 328 | .str = "a_mac_control_frames_transmitted", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:340:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 340 | .str = "a_pause_mac_ctrl_frames_received", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Kees Cook Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20250416010210.work.904-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 4 ++-- drivers/net/ethernet/google/gve/gve_ethtool.c | 4 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 +- include/linux/ethtool.h | 11 +++++++++++ 6 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 3b7068832f95..4a0e2d2eb60a 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -351,7 +351,7 @@ static void set_msglevel(struct net_device *dev, u32 val) adapter->msg_enable = val; } -static const char stats_strings[][ETH_GSTRING_LEN] = { +static const char stats_strings[][ETH_GSTRING_LEN] __nonstring_array = { "TxOctetsOK", "TxOctetsBad", "TxUnicastFramesOK", diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index ece3ae28ba82..f47c8b6cc511 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -141,7 +141,7 @@ static const struct { static const struct { int reg; - char name[ETH_GSTRING_LEN]; + char name[ETH_GSTRING_LEN] __nonstring; } enetc_port_counters[] = { { ENETC_PM_REOCT(0), "MAC rx ethernet octets" }, { ENETC_PM_RALN(0), "MAC rx alignment errors" }, @@ -264,7 +264,7 @@ static void enetc_get_strings(struct net_device *ndev, u32 stringset, u8 *data) break; for (i = 0; i < ARRAY_SIZE(enetc_port_counters); i++) - ethtool_puts(&data, enetc_port_counters[i].name); + ethtool_cpy(&data, enetc_port_counters[i].name); break; } diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index eae1a7595a69..3c1da0cf3f61 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -67,7 +67,7 @@ static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { "tx_xsk_sent[%u]", "tx_xdp_xmit[%u]", "tx_xdp_xmit_errors[%u]" }; -static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] __nonstring_array = { "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", @@ -113,7 +113,7 @@ static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data) i); for (i = 0; i < ARRAY_SIZE(gve_gstrings_adminq_stats); i++) - ethtool_puts(&s, gve_gstrings_adminq_stats[i]); + ethtool_cpy(&s, gve_gstrings_adminq_stats[i]); break; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 3f64cdbabfa3..0a8fb9c842d3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -262,7 +262,7 @@ err_port_pause_configure: } struct mlxsw_sp_port_hw_stats { - char str[ETH_GSTRING_LEN]; + char str[ETH_GSTRING_LEN] __nonstring; u64 (*getter)(const char *payload); bool cells_bytes; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 918a32f8fda8..844f7d516a40 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -37,7 +37,7 @@ #define ETHTOOL_DMA_OFFSET 55 struct stmmac_stats { - char stat_string[ETH_GSTRING_LEN]; + char stat_string[ETH_GSTRING_LEN] __nonstring; int sizeof_stat; int stat_offset; }; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 013d25858642..7edb5f5e7134 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1330,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); */ extern void ethtool_puts(u8 **data, const char *str); +/** + * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data + * @data: Pointer to a pointer to the start of string to write into + * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy from + */ +#define ethtool_cpy(data, str) do { \ + BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ + memcpy(*(data), str, ETH_GSTRING_LEN); \ + *(data) += ETH_GSTRING_LEN; \ +} while (0) + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; -- cgit v1.2.3 From 22cbc1ee268b7ec0000848708944daa61c6e4909 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 20:04:47 -0700 Subject: netdev: fix the locking for netdev notifications Kuniyuki reports that the assert for netdev lock fires when there are netdev event listeners (otherwise we skip the netlink event generation). Correct the locking when coming from the notifier. The NETDEV_XDP_FEAT_CHANGE notifier is already fully locked, it's the documentation that's incorrect. Fixes: 99e44f39a8f7 ("netdev: depend on netdev->lock for xdp features") Reported-by: syzkaller Reported-by: Kuniyuki Iwashima Link: https://lore.kernel.org/20250410171019.62128-1-kuniyu@amazon.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250416030447.1077551-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 4 +++- include/linux/netdevice.h | 2 +- include/net/netdev_lock.h | 16 ++++++++++++++++ net/core/lock_debug.c | 4 +++- net/core/netdev-genl.c | 4 ++++ 5 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 77fe1f7b93be..7ebb6c36482d 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -387,12 +387,14 @@ For device drivers that implement shaping or queue management APIs, some of the notifiers (``enum netdev_cmd``) are running under the netdev instance lock. +The following netdev notifiers are always run under the instance lock: +* ``NETDEV_XDP_FEAT_CHANGE`` + For devices with locked ops, currently only the following notifiers are running under the lock: * ``NETDEV_CHANGE`` * ``NETDEV_REGISTER`` * ``NETDEV_UP`` -* ``NETDEV_XDP_FEAT_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6036b82ef4c..0321fd952f70 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,7 +2520,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net, @xdp_flags + * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 5706835a660c..2a753813f849 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -48,6 +48,22 @@ static inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index 6fade574bc2a..9e9fb25314b9 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -18,10 +18,12 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_UP: - case NETDEV_XDP_FEAT_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b64c614a00c4..2c104947d224 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -963,10 +963,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); -- cgit v1.2.3 From 2b905deb43ea0b67fa8448fc9c15dacb068f45b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 16 Apr 2025 19:56:23 +0800 Subject: net: Delete the outer () duplicated of macro SOCK_SKB_CB_OFFSET definition For macro SOCK_SKB_CB_OFFSET definition, Delete the outer () duplicated. Signed-off-by: Zijun Hu Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-fix_net-v1-1-d544c9f3f169@quicinc.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index bb4d6189292f..e223102337c7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,8 +2604,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) -- cgit v1.2.3 From 1df4a945444f071a9c5e09580a485919c42d4de5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 16 Apr 2025 10:06:12 -0700 Subject: trace: tcp: Add const qualifier to skb parameter in tcp_probe event Change the tcp_probe tracepoint to accept a const struct sk_buff parameter instead of a non-const one. This improves type safety and better reflects that the skb is not modified within the tracepoint implementation. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-tcp_probe-v1-1-1edc3c5a1cb8@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 75d3d53a3832..53e878fa14d1 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -293,7 +293,7 @@ DECLARE_TRACE(tcp_cwnd_reduction_tp, TRACE_EVENT(tcp_probe, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), -- cgit v1.2.3 From 8066e388be48f1ad62b0449dc1d31a25489fa12a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Apr 2025 13:08:40 -0700 Subject: net: add UAPI to the header guard in various network headers fib_rule, ip6_tunnel, and a whole lot of if_* headers lack the customary _UAPI in the header guard. Without it YNL build can't protect from in tree and system headers both getting included. YNL doesn't need most of these but it's annoying to have to fix them one by one. Note that header installation strips this _UAPI prefix so this should result in no change to the end user. Acked-by: Jamal Hadi Salim Reviewed-by: Jason Xing Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250416200840.1338195-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/fib_rules.h | 4 ++-- include/uapi/linux/if_addr.h | 4 ++-- include/uapi/linux/if_addrlabel.h | 4 ++-- include/uapi/linux/if_alg.h | 6 +++--- include/uapi/linux/if_arcnet.h | 6 +++--- include/uapi/linux/if_bonding.h | 6 +++--- include/uapi/linux/if_fc.h | 6 +++--- include/uapi/linux/if_hippi.h | 6 +++--- include/uapi/linux/if_packet.h | 4 ++-- include/uapi/linux/if_plip.h | 4 ++-- include/uapi/linux/if_slip.h | 4 ++-- include/uapi/linux/if_x25.h | 6 +++--- include/uapi/linux/if_xdp.h | 6 +++--- include/uapi/linux/ip6_tunnel.h | 4 ++-- include/uapi/linux/net_dropmon.h | 4 ++-- include/uapi/linux/net_tstamp.h | 6 +++--- include/uapi/linux/netlink_diag.h | 4 ++-- include/uapi/linux/pkt_cls.h | 4 ++-- include/uapi/linux/pkt_sched.h | 4 ++-- 19 files changed, 46 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2df6e4035d50..418c4be697ad 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_FIB_RULES_H -#define __LINUX_FIB_RULES_H +#ifndef _UAPI__LINUX_FIB_RULES_H +#define _UAPI__LINUX_FIB_RULES_H #include #include diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 1c392dd95a5e..aa7958b4e41d 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_ADDR_H -#define __LINUX_IF_ADDR_H +#ifndef _UAPI__LINUX_IF_ADDR_H +#define _UAPI__LINUX_IF_ADDR_H #include #include diff --git a/include/uapi/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h index d1f5974c76e1..e69db764fbba 100644 --- a/include/uapi/linux/if_addrlabel.h +++ b/include/uapi/linux/if_addrlabel.h @@ -8,8 +8,8 @@ * YOSHIFUJI Hideaki @ USAGI/WIDE */ -#ifndef __LINUX_IF_ADDRLABEL_H -#define __LINUX_IF_ADDRLABEL_H +#ifndef _UAPI__LINUX_IF_ADDRLABEL_H +#define _UAPI__LINUX_IF_ADDRLABEL_H #include diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 0824fbc026a1..b35871cbeed7 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -11,8 +11,8 @@ * */ -#ifndef _LINUX_IF_ALG_H -#define _LINUX_IF_ALG_H +#ifndef _UAPI_LINUX_IF_ALG_H +#define _UAPI_LINUX_IF_ALG_H #include @@ -58,4 +58,4 @@ struct af_alg_iv { #define ALG_OP_DECRYPT 0 #define ALG_OP_ENCRYPT 1 -#endif /* _LINUX_IF_ALG_H */ +#endif /* _UAPI_LINUX_IF_ALG_H */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index b122cfac7128..473569eaf692 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -14,8 +14,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_ARCNET_H -#define _LINUX_IF_ARCNET_H +#ifndef _UAPI_LINUX_IF_ARCNET_H +#define _UAPI_LINUX_IF_ARCNET_H #include #include @@ -127,4 +127,4 @@ struct archdr { } soft; }; -#endif /* _LINUX_IF_ARCNET_H */ +#endif /* _UAPI_LINUX_IF_ARCNET_H */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..3bcc03f3aa4f 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -41,8 +41,8 @@ * - added definitions for various XOR hashing policies */ -#ifndef _LINUX_IF_BONDING_H -#define _LINUX_IF_BONDING_H +#ifndef _UAPI_LINUX_IF_BONDING_H +#define _UAPI_LINUX_IF_BONDING_H #include #include @@ -152,4 +152,4 @@ enum { }; #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) -#endif /* _LINUX_IF_BONDING_H */ +#endif /* _UAPI_LINUX_IF_BONDING_H */ diff --git a/include/uapi/linux/if_fc.h b/include/uapi/linux/if_fc.h index 3e3173282cc3..ff5ab92d16c2 100644 --- a/include/uapi/linux/if_fc.h +++ b/include/uapi/linux/if_fc.h @@ -18,8 +18,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_FC_H -#define _LINUX_IF_FC_H +#ifndef _UAPI_LINUX_IF_FC_H +#define _UAPI_LINUX_IF_FC_H #include @@ -49,4 +49,4 @@ struct fcllc { __be16 ethertype; /* ether type field */ }; -#endif /* _LINUX_IF_FC_H */ +#endif /* _UAPI_LINUX_IF_FC_H */ diff --git a/include/uapi/linux/if_hippi.h b/include/uapi/linux/if_hippi.h index 785a1452a66c..42c4ffd11dae 100644 --- a/include/uapi/linux/if_hippi.h +++ b/include/uapi/linux/if_hippi.h @@ -20,8 +20,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_HIPPI_H -#define _LINUX_IF_HIPPI_H +#ifndef _UAPI_LINUX_IF_HIPPI_H +#define _UAPI_LINUX_IF_HIPPI_H #include #include @@ -151,4 +151,4 @@ struct hippi_hdr { struct hippi_snap_hdr snap; } __attribute__((packed)); -#endif /* _LINUX_IF_HIPPI_H */ +#endif /* _UAPI_LINUX_IF_HIPPI_H */ diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1d2718dd9647..6cd1d7a41dfb 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_PACKET_H -#define __LINUX_IF_PACKET_H +#ifndef _UAPI__LINUX_IF_PACKET_H +#define _UAPI__LINUX_IF_PACKET_H #include #include diff --git a/include/uapi/linux/if_plip.h b/include/uapi/linux/if_plip.h index 495a366112f2..054d86a9c6e6 100644 --- a/include/uapi/linux/if_plip.h +++ b/include/uapi/linux/if_plip.h @@ -9,8 +9,8 @@ * */ -#ifndef _LINUX_IF_PLIP_H -#define _LINUX_IF_PLIP_H +#ifndef _UAPI_LINUX_IF_PLIP_H +#define _UAPI_LINUX_IF_PLIP_H #include diff --git a/include/uapi/linux/if_slip.h b/include/uapi/linux/if_slip.h index 65937be53103..299bf7adc862 100644 --- a/include/uapi/linux/if_slip.h +++ b/include/uapi/linux/if_slip.h @@ -6,8 +6,8 @@ * KISS TNC driver. */ -#ifndef __LINUX_SLIP_H -#define __LINUX_SLIP_H +#ifndef _UAPI__LINUX_SLIP_H +#define _UAPI__LINUX_SLIP_H #define SL_MODE_SLIP 0 #define SL_MODE_CSLIP 1 diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 3a5938e38370..861cfa983db4 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. */ -#ifndef _IF_X25_H -#define _IF_X25_H +#ifndef _UAPI_IF_X25_H +#define _UAPI_IF_X25_H #include @@ -24,4 +24,4 @@ #define X25_IFACE_DISCONNECT 0x02 #define X25_IFACE_PARAMS 0x03 -#endif /* _IF_X25_H */ +#endif /* _UAPI_IF_X25_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 42869770776e..44f2bb93e7e6 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -7,8 +7,8 @@ * Magnus Karlsson */ -#ifndef _LINUX_IF_XDP_H -#define _LINUX_IF_XDP_H +#ifndef _UAPI_LINUX_IF_XDP_H +#define _UAPI_LINUX_IF_XDP_H #include @@ -180,4 +180,4 @@ struct xdp_desc { /* TX packet carries valid metadata. */ #define XDP_TX_METADATA (1 << 1) -#endif /* _LINUX_IF_XDP_H */ +#endif /* _UAPI_LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 0245269b037c..85182a839d42 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IP6_TUNNEL_H -#define _IP6_TUNNEL_H +#ifndef _UAPI_IP6_TUNNEL_H +#define _UAPI_IP6_TUNNEL_H #include #include /* For IFNAMSIZ. */ diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 84f622a66a7a..9dd41c2f58a6 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NET_DROPMON_H -#define __NET_DROPMON_H +#ifndef _UAPI__NET_DROPMON_H +#define _UAPI__NET_DROPMON_H #include #include diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 383213de612a..a93e6ea37fb3 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -7,8 +7,8 @@ * */ -#ifndef _NET_TIMESTAMPING_H -#define _NET_TIMESTAMPING_H +#ifndef _UAPI_NET_TIMESTAMPING_H +#define _UAPI_NET_TIMESTAMPING_H #include #include /* for SO_TIMESTAMPING */ @@ -216,4 +216,4 @@ struct sock_txtime { __u32 flags; /* as defined by enum txtime_flags */ }; -#endif /* _NET_TIMESTAMPING_H */ +#endif /* _UAPI_NET_TIMESTAMPING_H */ diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index dfa61be43d2f..ff28200204bb 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NETLINK_DIAG_H__ -#define __NETLINK_DIAG_H__ +#ifndef _UAPI__NETLINK_DIAG_H__ +#define _UAPI__NETLINK_DIAG_H__ #include diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 2c32080416b5..490821364165 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_CLS_H -#define __LINUX_PKT_CLS_H +#ifndef _UAPI__LINUX_PKT_CLS_H +#define _UAPI__LINUX_PKT_CLS_H #include #include diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 25a9a47001cd..9ea874395717 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_SCHED_H -#define __LINUX_PKT_SCHED_H +#ifndef _UAPI__LINUX_PKT_SCHED_H +#define _UAPI__LINUX_PKT_SCHED_H #include #include -- cgit v1.2.3 From 9ff2aa4206eff40a202e425f232036bc84ad4c0e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 17 Mar 2025 23:07:30 -0400 Subject: net: ethtool: mm: extract stmmac verification logic into common library It appears that stmmac is not the only hardware which requires a software-driven verification state machine for the MAC Merge layer. While on the one hand it's good to encourage hardware implementations, on the other hand it's quite difficult to tolerate multiple drivers implementing independently fairly non-trivial logic. Extract the hardware-independent logic from stmmac into library code and put it in ethtool. Name the state structure "mmsv" for MAC Merge Software Verification. Let this expose an operations structure for executing the hardware stuff: sync hardware with the tx_active boolean (result of verification process), enable/disable the pMAC, send mPackets, notify library of external events (reception of mPackets), as well as link state changes. Note that it is assumed that the external events are received in hardirq context. If they are not, it is probably a good idea to disable hardirqs when calling ethtool_mmsv_event_handle(), because the library does not do so. Also, the MM software verification process has no business with the tx_min_frag_size, that is all the driver's to handle. Signed-off-by: Vladimir Oltean Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Tested-by: Choong Yong Liang Tested-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Signed-off-by: Tony Nguyen --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac.h | 16 +- .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 42 +--- drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c | 174 +++---------- drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h | 5 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 +- include/linux/ethtool.h | 73 ++++++ net/ethtool/mm.c | 275 ++++++++++++++++++++- 8 files changed, 392 insertions(+), 202 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 3c820ef56775..59c742a8b3ad 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -3,6 +3,7 @@ config STMMAC_ETH tristate "STMicroelectronics Multi-Gigabit Ethernet driver" depends on HAS_IOMEM && HAS_DMA depends on PTP_1588_CLOCK_OPTIONAL + depends on ETHTOOL_NETLINK select MII select PCS_XPCS select PAGE_POOL diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index bddfa0f4aa21..1686e559f66e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -149,21 +149,9 @@ struct stmmac_channel { }; struct stmmac_fpe_cfg { - /* Serialize access to MAC Merge state between ethtool requests - * and link state updates. - */ - spinlock_t lock; - + struct ethtool_mmsv mmsv; const struct stmmac_fpe_reg *reg; - u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ - - enum ethtool_mm_verify_status status; - struct timer_list verify_timer; - bool verify_enabled; - int verify_retries; - bool pmac_enabled; - u32 verify_time; - bool tx_enabled; + u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ }; struct stmmac_tc_entry { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 53f51eebb746..f702f7b7bf9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1210,7 +1210,6 @@ static int stmmac_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) { struct stmmac_priv *priv = netdev_priv(ndev); - unsigned long flags; u32 frag_size; if (!stmmac_fpe_supported(priv)) @@ -1220,26 +1219,7 @@ static int stmmac_get_mm(struct net_device *ndev, frag_size = stmmac_fpe_get_add_frag_size(priv); state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); - spin_lock_irqsave(&priv->fpe_cfg.lock, flags); - - state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; - state->verify_enabled = priv->fpe_cfg.verify_enabled; - state->pmac_enabled = priv->fpe_cfg.pmac_enabled; - state->verify_time = priv->fpe_cfg.verify_time; - state->tx_enabled = priv->fpe_cfg.tx_enabled; - state->verify_status = priv->fpe_cfg.status; - - /* FPE active if common tx_enabled and - * (verification success or disabled(forced)) - */ - if (state->tx_enabled && - (state->verify_status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || - state->verify_status == ETHTOOL_MM_VERIFY_STATUS_DISABLED)) - state->tx_active = true; - else - state->tx_active = false; - - spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); + ethtool_mmsv_get_mm(&priv->fpe_cfg.mmsv, state); return 0; } @@ -1248,8 +1228,6 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct stmmac_priv *priv = netdev_priv(ndev); - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - unsigned long flags; u32 frag_size; int err; @@ -1259,23 +1237,7 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, return err; stmmac_fpe_set_add_frag_size(priv, frag_size); - - /* Wait for the verification that's currently in progress to finish */ - timer_shutdown_sync(&fpe_cfg->verify_timer); - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - fpe_cfg->verify_enabled = cfg->verify_enabled; - fpe_cfg->pmac_enabled = cfg->pmac_enabled; - fpe_cfg->verify_time = cfg->verify_time; - fpe_cfg->tx_enabled = cfg->tx_enabled; - - if (!cfg->verify_enabled) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - - stmmac_fpe_apply(priv); - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); + ethtool_mmsv_set_mm(&priv->fpe_cfg.mmsv, cfg); return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c index 3a4bee029c7f..75b470ee621a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c @@ -27,12 +27,6 @@ #define STMMAC_MAC_FPE_CTRL_STS_SVER BIT(1) #define STMMAC_MAC_FPE_CTRL_STS_EFPE BIT(0) -/* FPE link-partner hand-shaking mPacket type */ -enum stmmac_mpacket_type { - MPACKET_VERIFY = 0, - MPACKET_RESPONSE = 1, -}; - struct stmmac_fpe_reg { const u32 mac_fpe_reg; /* offset of MAC_FPE_CTRL_STS */ const u32 mtl_fpe_reg; /* offset of MTL_FPE_CTRL_STS */ @@ -48,10 +42,10 @@ bool stmmac_fpe_supported(struct stmmac_priv *priv) priv->hw->mac->fpe_map_preemption_class; } -static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, - bool pmac_enable) +static void stmmac_fpe_configure_tx(struct ethtool_mmsv *mmsv, bool tx_enable) { - struct stmmac_fpe_cfg *cfg = &priv->fpe_cfg; + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); const struct stmmac_fpe_reg *reg = cfg->reg; u32 num_rxq = priv->plat->rx_queues_to_use; void __iomem *ioaddr = priv->ioaddr; @@ -68,6 +62,15 @@ static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, cfg->fpe_csr = 0; } writel(cfg->fpe_csr, ioaddr + reg->mac_fpe_reg); +} + +static void stmmac_fpe_configure_pmac(struct ethtool_mmsv *mmsv, bool pmac_enable) +{ + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); + const struct stmmac_fpe_reg *reg = cfg->reg; + void __iomem *ioaddr = priv->ioaddr; + u32 value; value = readl(ioaddr + reg->int_en_reg); @@ -85,47 +88,45 @@ static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, writel(value, ioaddr + reg->int_en_reg); } -static void stmmac_fpe_send_mpacket(struct stmmac_priv *priv, - enum stmmac_mpacket_type type) +static void stmmac_fpe_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket type) { - const struct stmmac_fpe_reg *reg = priv->fpe_cfg.reg; + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); + const struct stmmac_fpe_reg *reg = cfg->reg; void __iomem *ioaddr = priv->ioaddr; - u32 value = priv->fpe_cfg.fpe_csr; + u32 value = cfg->fpe_csr; - if (type == MPACKET_VERIFY) + if (type == ETHTOOL_MPACKET_VERIFY) value |= STMMAC_MAC_FPE_CTRL_STS_SVER; - else if (type == MPACKET_RESPONSE) + else if (type == ETHTOOL_MPACKET_RESPONSE) value |= STMMAC_MAC_FPE_CTRL_STS_SRSP; writel(value, ioaddr + reg->mac_fpe_reg); } +static const struct ethtool_mmsv_ops stmmac_mmsv_ops = { + .configure_tx = stmmac_fpe_configure_tx, + .configure_pmac = stmmac_fpe_configure_pmac, + .send_mpacket = stmmac_fpe_send_mpacket, +}; + static void stmmac_fpe_event_status(struct stmmac_priv *priv, int status) { struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; + struct ethtool_mmsv *mmsv = &fpe_cfg->mmsv; - /* This is interrupt context, just spin_lock() */ - spin_lock(&fpe_cfg->lock); - - if (!fpe_cfg->pmac_enabled || status == FPE_EVENT_UNKNOWN) - goto unlock_out; + if (status == FPE_EVENT_UNKNOWN) + return; - /* LP has sent verify mPacket */ if ((status & FPE_EVENT_RVER) == FPE_EVENT_RVER) - stmmac_fpe_send_mpacket(priv, MPACKET_RESPONSE); + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET); - /* Local has sent verify mPacket */ - if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER) + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET); - /* LP has sent response mPacket */ - if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP && - fpe_cfg->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; - -unlock_out: - spin_unlock(&fpe_cfg->lock); + if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP) + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET); } void stmmac_fpe_irq_status(struct stmmac_priv *priv) @@ -164,119 +165,16 @@ void stmmac_fpe_irq_status(struct stmmac_priv *priv) stmmac_fpe_event_status(priv, status); } -/** - * stmmac_fpe_verify_timer - Timer for MAC Merge verification - * @t: timer_list struct containing private info - * - * Verify the MAC Merge capability in the local TX direction, by - * transmitting Verify mPackets up to 3 times. Wait until link - * partner responds with a Response mPacket, otherwise fail. - */ -static void stmmac_fpe_verify_timer(struct timer_list *t) -{ - struct stmmac_fpe_cfg *fpe_cfg = from_timer(fpe_cfg, t, verify_timer); - struct stmmac_priv *priv = container_of(fpe_cfg, struct stmmac_priv, - fpe_cfg); - unsigned long flags; - bool rearm = false; - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - switch (fpe_cfg->status) { - case ETHTOOL_MM_VERIFY_STATUS_INITIAL: - case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: - if (fpe_cfg->verify_retries != 0) { - stmmac_fpe_send_mpacket(priv, MPACKET_VERIFY); - rearm = true; - } else { - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; - } - - fpe_cfg->verify_retries--; - break; - - case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: - stmmac_fpe_configure(priv, true, true); - break; - - default: - break; - } - - if (rearm) { - mod_timer(&fpe_cfg->verify_timer, - jiffies + msecs_to_jiffies(fpe_cfg->verify_time)); - } - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); -} - -static void stmmac_fpe_verify_timer_arm(struct stmmac_fpe_cfg *fpe_cfg) -{ - if (fpe_cfg->pmac_enabled && fpe_cfg->tx_enabled && - fpe_cfg->verify_enabled && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { - timer_setup(&fpe_cfg->verify_timer, stmmac_fpe_verify_timer, 0); - mod_timer(&fpe_cfg->verify_timer, jiffies); - } -} - void stmmac_fpe_init(struct stmmac_priv *priv) { - priv->fpe_cfg.verify_retries = STMMAC_FPE_MM_MAX_VERIFY_RETRIES; - priv->fpe_cfg.verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; - priv->fpe_cfg.status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - timer_setup(&priv->fpe_cfg.verify_timer, stmmac_fpe_verify_timer, 0); - spin_lock_init(&priv->fpe_cfg.lock); + ethtool_mmsv_init(&priv->fpe_cfg.mmsv, priv->dev, + &stmmac_mmsv_ops); if ((!priv->fpe_cfg.reg || !priv->hw->mac->fpe_map_preemption_class) && priv->dma_cap.fpesel) dev_info(priv->device, "FPE is not supported by driver.\n"); } -void stmmac_fpe_apply(struct stmmac_priv *priv) -{ - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - - /* If verification is disabled, configure FPE right away. - * Otherwise let the timer code do it. - */ - if (!fpe_cfg->verify_enabled) { - stmmac_fpe_configure(priv, fpe_cfg->tx_enabled, - fpe_cfg->pmac_enabled); - } else { - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; - fpe_cfg->verify_retries = STMMAC_FPE_MM_MAX_VERIFY_RETRIES; - - if (netif_running(priv->dev)) - stmmac_fpe_verify_timer_arm(fpe_cfg); - } -} - -void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up) -{ - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - unsigned long flags; - - timer_shutdown_sync(&fpe_cfg->verify_timer); - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - if (is_up && fpe_cfg->pmac_enabled) { - /* VERIFY process requires pmac enabled when NIC comes up */ - stmmac_fpe_configure(priv, false, true); - - /* New link => maybe new partner => new verification process */ - stmmac_fpe_apply(priv); - } else { - /* No link => turn off EFPE */ - stmmac_fpe_configure(priv, false, false); - } - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); -} - int stmmac_fpe_get_add_frag_size(struct stmmac_priv *priv) { const struct stmmac_fpe_reg *reg = priv->fpe_cfg.reg; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h index b884eac7142d..3fc46acf7001 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h @@ -9,15 +9,10 @@ #include #include -#define STMMAC_FPE_MM_MAX_VERIFY_RETRIES 3 -#define STMMAC_FPE_MM_MAX_VERIFY_TIME_MS 128 - struct stmmac_priv; -void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up); bool stmmac_fpe_supported(struct stmmac_priv *priv); void stmmac_fpe_init(struct stmmac_priv *priv); -void stmmac_fpe_apply(struct stmmac_priv *priv); void stmmac_fpe_irq_status(struct stmmac_priv *priv); int stmmac_fpe_get_add_frag_size(struct stmmac_priv *priv); void stmmac_fpe_set_add_frag_size(struct stmmac_priv *priv, u32 add_frag_size); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 59d07d0d3369..f59a2363f150 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -946,7 +946,7 @@ static void stmmac_mac_link_down(struct phylink_config *config, stmmac_set_eee_pls(priv, priv->hw, false); if (stmmac_fpe_supported(priv)) - stmmac_fpe_link_state_handle(priv, false); + ethtool_mmsv_link_state_handle(&priv->fpe_cfg.mmsv, false); } static void stmmac_mac_link_up(struct phylink_config *config, @@ -1064,7 +1064,7 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_set_eee_pls(priv, priv->hw, true); if (stmmac_fpe_supported(priv)) - stmmac_fpe_link_state_handle(priv, true); + ethtool_mmsv_link_state_handle(&priv->fpe_cfg.mmsv, true); if (priv->plat->flags & STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY) stmmac_hwtstamp_correct_latency(priv, priv); @@ -4152,7 +4152,7 @@ static int stmmac_release(struct net_device *dev) stmmac_release_ptp(priv); if (stmmac_fpe_supported(priv)) - timer_shutdown_sync(&priv->fpe_cfg.verify_timer); + ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); pm_runtime_put(priv->device); @@ -7871,7 +7871,7 @@ int stmmac_suspend(struct device *dev) rtnl_unlock(); if (stmmac_fpe_supported(priv)) - timer_shutdown_sync(&priv->fpe_cfg.verify_timer); + ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); return 0; } diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 7edb5f5e7134..117718c24814 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -17,9 +17,13 @@ #include #include #include +#include #include #include +#define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 +#define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 + struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; @@ -718,6 +722,75 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +enum ethtool_mmsv_event { + ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, +}; + +/* MAC Merge verification mPacket type */ +enum ethtool_mpacket { + ETHTOOL_MPACKET_VERIFY, + ETHTOOL_MPACKET_RESPONSE, +}; + +struct ethtool_mmsv; + +/** + * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification + * @configure_tx: Driver callback for the event where the preemptible TX + * becomes active or inactive. Preemptible traffic + * classes must be committed to hardware only while + * preemptible TX is active. + * @configure_pmac: Driver callback for the event where the pMAC state + * changes as result of an administrative setting + * (ethtool) or a call to ethtool_mmsv_link_state_handle(). + * @send_mpacket: Driver-provided method for sending a Verify or a Response + * mPacket. + */ +struct ethtool_mmsv_ops { + void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); + void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); + void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); +}; + +/** + * struct ethtool_mmsv - MAC Merge Software Verification + * @ops: operations for MAC Merge Software Verification + * @dev: pointer to net_device structure + * @lock: serialize access to MAC Merge state between + * ethtool requests and link state updates. + * @status: current verification FSM state + * @verify_timer: timer for verification in local TX direction + * @verify_enabled: indicates if verification is enabled + * @verify_retries: number of retries for verification + * @pmac_enabled: indicates if the preemptible MAC is enabled + * @verify_time: time for verification in milliseconds + * @tx_enabled: indicates if transmission is enabled + */ +struct ethtool_mmsv { + const struct ethtool_mmsv_ops *ops; + struct net_device *dev; + spinlock_t lock; + enum ethtool_mm_verify_status status; + struct timer_list verify_timer; + bool verify_enabled; + int verify_retries; + bool pmac_enabled; + u32 verify_time; + bool tx_enabled; +}; + +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event); +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state); +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops); + /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 2816bb23c3ad..bfd988464d7d 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2022-2023 NXP + * Copyright 2022-2025 NXP + * Copyright 2024 Furong Xu <0x1207@gmail.com> */ #include "common.h" #include "netlink.h" @@ -282,3 +283,275 @@ bool ethtool_dev_mm_supported(struct net_device *dev) return supported; } EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported); + +static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, + bool tx_active) +{ + if (mmsv->ops->configure_tx) + mmsv->ops->configure_tx(mmsv, tx_active); +} + +static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, + bool pmac_enabled) +{ + if (mmsv->ops->configure_pmac) + mmsv->ops->configure_pmac(mmsv, pmac_enabled); +} + +static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket mpacket) +{ + if (mmsv->ops->send_mpacket) + mmsv->ops->send_mpacket(mmsv, mpacket); +} + +/** + * ethtool_mmsv_verify_timer - Timer for MAC Merge verification + * @t: timer_list struct containing private info + * + * Verify the MAC Merge capability in the local TX direction, by + * transmitting Verify mPackets up to 3 times. Wait until link + * partner responds with a Response mPacket, otherwise fail. + */ +static void ethtool_mmsv_verify_timer(struct timer_list *t) +{ + struct ethtool_mmsv *mmsv = from_timer(mmsv, t, verify_timer); + unsigned long flags; + bool rearm = false; + + spin_lock_irqsave(&mmsv->lock, flags); + + switch (mmsv->status) { + case ETHTOOL_MM_VERIFY_STATUS_INITIAL: + case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + if (mmsv->verify_retries != 0) { + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY); + rearm = true; + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; + } + + mmsv->verify_retries--; + break; + + case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + ethtool_mmsv_configure_tx(mmsv, true); + break; + + default: + break; + } + + if (rearm) { + mod_timer(&mmsv->verify_timer, + jiffies + msecs_to_jiffies(mmsv->verify_time)); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} + +static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv) +{ + if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + mod_timer(&mmsv->verify_timer, jiffies); + } +} + +static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv) +{ + /* If verification is disabled, configure FPE right away. + * Otherwise let the timer code do it. + */ + if (!mmsv->verify_enabled) { + ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled); + ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled); + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + + if (netif_running(mmsv->dev)) + ethtool_mmsv_verify_timer_arm(mmsv); + } +} + +/** + * ethtool_mmsv_stop() - Stop MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * + * Drivers should call this method in a state where the hardware is + * about to lose state, like ndo_stop() or suspend(), and turning off + * MAC Merge features would be superfluous. Otherwise, prefer + * ethtool_mmsv_link_state_handle() with up=false. + */ +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv) +{ + timer_shutdown_sync(&mmsv->verify_timer); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_stop); + +/** + * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification + * of link state changes + * @mmsv: MAC Merge Software Verification state + * @up: True if device carrier is up and able to pass verification packets + * + * Calling context is expected to be from a task, interrupts enabled. + */ +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) +{ + unsigned long flags; + + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + if (up && mmsv->pmac_enabled) { + /* VERIFY process requires pMAC enabled when NIC comes up */ + ethtool_mmsv_configure_pmac(mmsv, true); + + /* New link => maybe new partner => new verification process */ + ethtool_mmsv_apply(mmsv); + } else { + /* No link or pMAC not enabled */ + ethtool_mmsv_configure_pmac(mmsv, false); + ethtool_mmsv_configure_tx(mmsv, false); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle); + +/** + * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification + * of interrupt-based events + * @mmsv: MAC Merge Software Verification state + * @event: Event which took place (packet transmission or reception) + * + * Calling context expects to have interrupts disabled. + */ +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event) +{ + /* This is interrupt context, just spin_lock() */ + spin_lock(&mmsv->lock); + + if (!mmsv->pmac_enabled) + goto unlock; + + switch (event) { + case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET: + /* Link partner has sent verify mPacket */ + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE); + break; + case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET: + /* Local device has sent verify mPacket */ + if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + break; + case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET: + /* Link partner has sent response mPacket */ + if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; + break; + } + +unlock: + spin_unlock(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle); + +static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv) +{ + /* TX is active if administratively enabled, and verification either + * succeeded, or was administratively disabled. + */ + return mmsv->tx_enabled && + (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || + mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED); +} + +/** + * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @state: see struct ethtool_mm_state + * + * Drivers are expected to call this from their ethtool_ops :: get_mm() + * method. + */ +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&mmsv->lock, flags); + + state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + state->verify_enabled = mmsv->verify_enabled; + state->pmac_enabled = mmsv->pmac_enabled; + state->verify_time = mmsv->verify_time; + state->tx_enabled = mmsv->tx_enabled; + state->verify_status = mmsv->status; + state->tx_active = ethtool_mmsv_is_tx_active(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm); + +/** + * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @cfg: see struct ethtool_mm_cfg + * + * Drivers are expected to call this from their ethtool_ops :: set_mm() + * method. + */ +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg) +{ + unsigned long flags; + + /* Wait for the verification that's currently in progress to finish */ + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + mmsv->verify_enabled = cfg->verify_enabled; + mmsv->pmac_enabled = cfg->pmac_enabled; + mmsv->verify_time = cfg->verify_time; + mmsv->tx_enabled = cfg->tx_enabled; + + if (!cfg->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + + ethtool_mmsv_apply(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm); + +/** + * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state + * @mmsv: MAC Merge Software Verification state + * @dev: Pointer to network interface + * @ops: Methods for implementing the generic functionality + * + * The MAC Merge Software Verification is a timer- and event-based state + * machine intended for network interfaces which lack a hardware-based + * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer + * is managed by the core code, whereas events are supplied by the + * driver explicitly calling one of the other API functions. + */ +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops) +{ + mmsv->ops = ops; + mmsv->dev = dev; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + spin_lock_init(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_init); -- cgit v1.2.3 From 094adad91310d9f8f8485251129482f4f3e2c5b3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:34 +0300 Subject: vxlan: Use a single lock to protect the FDB table Currently, the VXLAN driver stores FDB entries in a hash table with a fixed number of buckets (256). Subsequent patches are going to convert this table to rhashtable with a linked list for entry traversal, as rhashtable is more scalable. In preparation for this conversion, move from a per-bucket spin lock to a single spin lock that protects the entire FDB table. The per-bucket spin locks were introduced by commit fe1e0713bbe8 ("vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention") citing "huge contention when inserting/deleting vxlan_fdbs into the fdb_head". It is not clear from the commit message which code path was holding the spin lock for long periods of time, but the obvious suspect is the FDB cleanup routine (vxlan_cleanup()) that periodically traverses the entire table in order to delete aged-out entries. This will be solved by subsequent patches that will convert the FDB cleanup routine to traverse the linked list of FDB entries using RCU, only acquiring the spin lock when deleting an aged-out entry. The change reduces the size of the VXLAN device structure from 3600 bytes to 2576 bytes. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-7-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 82 ++++++++++++++----------------------- drivers/net/vxlan/vxlan_vnifilter.c | 8 ++-- include/net/vxlan.h | 2 +- 3 files changed, 34 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index d45b63180714..b8edd8afda28 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -524,8 +524,8 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, return -EINVAL; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { @@ -537,12 +537,12 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, } } } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); @@ -558,14 +558,14 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) return; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); @@ -1248,7 +1248,6 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { @@ -1268,13 +1267,12 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1325,7 +1323,6 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; __be16 port; int err; @@ -1334,11 +1331,10 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1486,10 +1482,8 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, rdst->remote_ip = *src_ip; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { - u32 hash_index = fdb_head_index(vxlan, src_mac, vni); - /* learned new entry */ - spin_lock(&vxlan->hash_lock[hash_index]); + spin_lock(&vxlan->hash_lock); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) @@ -1500,7 +1494,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); - spin_unlock(&vxlan->hash_lock[hash_index]); + spin_unlock(&vxlan->hash_lock); } return SKB_NOT_DROPPED_YET; @@ -2839,10 +2833,10 @@ static void vxlan_cleanup(struct timer_list *t) if (!netif_running(vxlan->dev)) return; + spin_lock(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -2864,8 +2858,8 @@ static void vxlan_cleanup(struct timer_list *t) } else if (time_before(timeout, next_timer)) next_timer = timeout; } - spin_unlock(&vxlan->hash_lock[h]); } + spin_unlock(&vxlan->hash_lock); mod_timer(&vxlan->age_timer, next_timer); } @@ -3056,10 +3050,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -3079,8 +3073,8 @@ static void vxlan_flush(struct vxlan_dev *vxlan, vxlan_fdb_destroy(vxlan, f, true, true); } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { @@ -3358,15 +3352,14 @@ static void vxlan_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); + spin_lock_init(&vxlan->hash_lock); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_init(&vxlan->hash_lock[h]); + for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); - } } static void vxlan_ether_setup(struct net_device *dev) @@ -3977,10 +3970,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, - dst->remote_vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, @@ -3990,7 +3980,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (err) goto unlink; } @@ -4420,9 +4410,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], /* handle default dst entry */ if (rem_ip_changed) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, @@ -4433,7 +4421,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; @@ -4447,7 +4435,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], dst->remote_vni, dst->remote_ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip @@ -4747,11 +4735,8 @@ vxlan_fdb_offloaded_set(struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; - u32 hash_index; - - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4767,7 +4752,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, rdst->offloaded = fdb_info->offloaded; out: - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } static int @@ -4776,13 +4761,11 @@ vxlan_fdb_external_learn_add(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; - u32 hash_index; int err; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, @@ -4792,7 +4775,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4803,11 +4786,9 @@ vxlan_fdb_external_learn_del(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4821,7 +4802,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, fdb_info->remote_ifindex, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4870,18 +4851,15 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; - u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); - hash_index = fdb_head_index(vxlan, fdb->eth_addr, - vxlan->default_dst.remote_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index d0753776d339..336cd0e20309 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -482,11 +482,9 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, @@ -498,7 +496,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } } @@ -511,7 +509,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, true); } - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacd..272e11708a33 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; -- cgit v1.2.3 From 8d45673d2d2e59d03e108c569a3e8c031aa534c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:35 +0300 Subject: vxlan: Add a linked list of FDB entries Currently, FDB entries are stored in a hash table with a fixed number of buckets. The table is used for both lookups and entry traversal. Subsequent patches will convert the table to rhashtable which is not suitable for entry traversal. In preparation for this conversion, add FDB entries to a linked list. Subsequent patches will convert the driver to use this list when traversing entries during dump, flush, etc. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-8-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 3 +++ drivers/net/vxlan/vxlan_private.h | 1 + include/net/vxlan.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b8edd8afda28..511c24e29d45 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -908,6 +908,7 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); + hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; @@ -962,6 +963,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, swdev_notify, NULL); } + hlist_del_init_rcu(&f->fdb_node); hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); @@ -3360,6 +3362,7 @@ static void vxlan_setup(struct net_device *dev) for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); + INIT_HLIST_HEAD(&vxlan->fdb_list); } static void vxlan_ether_setup(struct net_device *dev) diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 76a351a997d5..078702ec604d 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -36,6 +36,7 @@ struct vxlan_fdb { __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; + struct hlist_node fdb_node; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 272e11708a33..96a6c6f45c2e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -307,6 +307,7 @@ struct vxlan_dev { struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; -- cgit v1.2.3 From 1f763fa808e92a67feea8364ef80ca3065d74702 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:43 +0300 Subject: vxlan: Convert FDB table to rhashtable FDB entries are currently stored in a hash table with a fixed number of buckets (256), resulting in performance degradation as the number of entries grows. Solve this by converting the driver to use rhashtable which maintains more or less constant performance regardless of the number of entries. Measured transmitted packets per second using a single pktgen thread with varying number of entries when the transmitted packet always hits the default entry (worst case): Number of entries | Improvement ------------------|------------ 1k | +1.12% 4k | +9.22% 16k | +55% 64k | +585% 256k | +2460% In addition, the change reduces the size of the VXLAN device structure from 2584 bytes to 672 bytes. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-16-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 102 +++++++++++++++----------------------- drivers/net/vxlan/vxlan_private.h | 2 +- include/net/vxlan.h | 2 +- 3 files changed, 43 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 8e359cf8dbbd..a56d7239b127 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -63,8 +64,12 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; +static const struct rhashtable_params vxlan_fdb_rht_params = { + .head_offset = offsetof(struct vxlan_fdb, rhnode), + .key_offset = offsetof(struct vxlan_fdb, key), + .key_len = sizeof(struct vxlan_fdb_key), + .automatic_shrinking = true, +}; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -371,62 +376,21 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -u32 eth_vni_hash(const unsigned char *addr, __be32 vni) -{ - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(addr + 2)); - - return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); -} - -u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) -{ - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) - return eth_vni_hash(mac, vni); - else - return eth_hash(mac); -} - -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) -{ - return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; -} - /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); - struct vxlan_fdb *f; + struct vxlan_fdb_key key; - hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->key.eth_addr)) { - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->key.vni) - return f; - } else { - return f; - } - } - } + memset(&key, 0, sizeof(key)); + memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); + if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) + key.vni = vxlan->default_dst.remote_vni; + else + key.vni = vni; - return NULL; + return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, + vxlan_fdb_rht_params); } static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, @@ -915,15 +879,27 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); + if (rc) + goto destroy_remote; + ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; return 0; +destroy_remote: + if (rcu_access_pointer(f->nh)) { + list_del_rcu(&f->nh_list); + nexthop_put(rtnl_dereference(f->nh)); + } else { + list_del(&rd->list); + dst_cache_destroy(&rd->dst_cache); + kfree(rd); + } errout: kfree(f); return rc; @@ -974,7 +950,8 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, } hlist_del_init_rcu(&f->fdb_node); - hlist_del_rcu(&f->hlist); + rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -2898,10 +2875,14 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); + if (err) + return err; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) - return err; + goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); @@ -2920,6 +2901,8 @@ err_gro_cells_destroy: err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); +err_rhashtable_destroy: + rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } @@ -2933,6 +2916,8 @@ static void vxlan_uninit(struct net_device *dev) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); + + rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing timer and join group when device is brought up */ @@ -3329,7 +3314,6 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); @@ -3362,8 +3346,6 @@ static void vxlan_setup(struct net_device *dev) vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vxlan->fdb_head[h]); INIT_HLIST_HEAD(&vxlan->fdb_list); } @@ -4944,8 +4926,6 @@ static int __init vxlan_init_module(void) { int rc; - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); - rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 3ca19e7167c9..d328aed9feef 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -31,7 +31,7 @@ struct vxlan_fdb_key { /* Forwarding table entry */ struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ + struct rhash_head rhnode; struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 96a6c6f45c2e..e2f7ca045d3e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -304,7 +304,7 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; struct hlist_head fdb_list; -- cgit v1.2.3 From 45bd443bfd8697a7da308c16c3e75e2bb353b3d1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 18 Apr 2025 02:15:19 +0100 Subject: net: 802: Remove unused p8022 code p8022.c defines two external functions, register_8022_client() and unregister_8022_client(), the last use of which was removed in 2018 by commit 7a2e838d28cf ("staging: ipx: delete it from the tree") Remove the p8022.c file, it's corresponding header, and glue surrounding it. There was one place the header was included in vlan.c but it didn't use the functions it declared. There was a comment in net/802/Makefile about checking against net/core/Makefile, but that's at least 20 years old and there's no sign of net/core/Makefile mentioning it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250418011519.145320-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/p8022.h | 16 -------------- net/802/Makefile | 5 ++--- net/802/p8022.c | 64 ----------------------------------------------------- net/8021q/vlan.c | 1 - 4 files changed, 2 insertions(+), 84 deletions(-) delete mode 100644 include/net/p8022.h delete mode 100644 net/802/p8022.c (limited to 'include') diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/net/802/Makefile b/net/802/Makefile index bfed80221b8b..99abc29d537c 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,12 +3,11 @@ # Makefile for the Linux 802.x protocol layers. # -# Check the p8022 selections against net/core/Makefile. -obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/p8022.c b/net/802/p8022.c deleted file mode 100644 index 78c25168d7c9..000000000000 --- a/net/802/p8022.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: Support for 802.2 demultiplexing off Ethernet - * - * Demultiplex 802.2 encoded protocols. We match the entry by the - * SSAP/DSAP pair and then deliver to the registered datalink that - * matches. The control byte is ignored and handling of such items - * is up to the routine passed the frame. - * - * Unlike the 802.3 datalink we have a list of 802.2 entries as - * there are multiple protocols to demux. The list is currently - * short (3 or 4 entries at most). The current demux assumes this. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - const unsigned char *dest) -{ - llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); - return 0; -} - -struct datalink_proto *register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)) -{ - struct datalink_proto *proto; - - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - if (proto) { - proto->type[0] = type; - proto->header_length = 3; - proto->request = p8022_request; - proto->sap = llc_sap_open(type, func); - if (!proto->sap) { - kfree(proto); - proto = NULL; - } - } - return proto; -} - -void unregister_8022_client(struct datalink_proto *proto) -{ - llc_sap_put(proto->sap); - kfree(proto); -} - -EXPORT_SYMBOL(register_8022_client); -EXPORT_SYMBOL(unregister_8022_client); - -MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet"); -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 41be38264493..06908e37c3d9 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 145436ae01193c0a379fd3ea9c4fbdf32863db1f Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:49 +0200 Subject: net: phy: Add helper for getting MAC termination resistance Add helper which returns the MAC termination resistance value. Modifying the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. Reviewed-by: Russell King (Oracle) Signed-off-by: Dimitri Fedrau Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-3-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 15 +++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc6c209fe702..f85c172c446c 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2975,6 +2975,21 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, } EXPORT_SYMBOL_GPL(phy_get_tx_amplitude_gain); +/** + * phy_get_mac_termination - stores MAC termination in @val + * @phydev: phy_device struct + * @dev: pointer to the devices device struct + * @val: MAC termination + * + * Returns: 0 on success, < 0 on failure + */ +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val) +{ + return phy_get_u32_property(dev, "mac-termination-ohms", val); +} +EXPORT_SYMBOL_GPL(phy_get_mac_termination); + static int phy_led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { diff --git a/include/linux/phy.h b/include/linux/phy.h index fb755358d965..066a28a4b64b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); -- cgit v1.2.3 From 0e0a7e3719bc8cbe6d6e30b3e81f21472ecba5bc Mon Sep 17 00:00:00 2001 From: Joshua Washington Date: Mon, 21 Apr 2025 18:16:32 -0700 Subject: xdp: create locked/unlocked instances of xdp redirect target setters Commit 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock") introduces the netdev lock to xdp_set_features_flag(). The change includes a _locked version of the method, as it is possible for a driver to have already acquired the netdev lock before calling this helper. However, the same applies to xdp_features_(set|clear)_redirect_flags(), which ends up calling the unlocked version of xdp_set_features_flags() leading to deadlocks in GVE, which grabs the netdev lock as part of its suspend, reset, and shutdown processes: [ 833.265543] WARNING: possible recursive locking detected [ 833.270949] 6.15.0-rc1 #6 Tainted: G E [ 833.276271] -------------------------------------------- [ 833.281681] systemd-shutdow/1 is trying to acquire lock: [ 833.287090] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: xdp_set_features_flag+0x29/0x90 [ 833.295470] [ 833.295470] but task is already holding lock: [ 833.301400] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve] [ 833.309508] [ 833.309508] other info that might help us debug this: [ 833.316130] Possible unsafe locking scenario: [ 833.316130] [ 833.322142] CPU0 [ 833.324681] ---- [ 833.327220] lock(&dev->lock); [ 833.330455] lock(&dev->lock); [ 833.333689] [ 833.333689] *** DEADLOCK *** [ 833.333689] [ 833.339701] May be due to missing lock nesting notation [ 833.339701] [ 833.346582] 5 locks held by systemd-shutdow/1: [ 833.351205] #0: ffffffffa9c89130 (system_transition_mutex){+.+.}-{4:4}, at: __se_sys_reboot+0xe6/0x210 [ 833.360695] #1: ffff93b399e5c1b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xb4/0x1f0 [ 833.369144] #2: ffff949d19a471b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xc2/0x1f0 [ 833.377603] #3: ffffffffa9eca050 (rtnl_mutex){+.+.}-{4:4}, at: gve_shutdown+0x33/0x90 [gve] [ 833.386138] #4: ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve] Introduce xdp_features_(set|clear)_redirect_target_locked() versions which assume that the netdev lock has already been acquired before setting the XDP feature flag and update GVE to use the locked version. Fixes: 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock") Tested-by: Mina Almasry Reviewed-by: Willem de Bruijn Reviewed-by: Harshitha Ramamurthy Signed-off-by: Joshua Washington Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250422011643.3509287-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 4 ++-- include/net/xdp.h | 3 +++ net/core/xdp.c | 25 +++++++++++++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8aaac9101377..446e4b6fd3f1 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1830,7 +1830,7 @@ static void gve_turndown(struct gve_priv *priv) /* Stop tx queues */ netif_tx_disable(priv->dev); - xdp_features_clear_redirect_target(priv->dev); + xdp_features_clear_redirect_target_locked(priv->dev); gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); @@ -1902,7 +1902,7 @@ static void gve_turnup(struct gve_priv *priv) } if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv)) - xdp_features_set_redirect_target(priv->dev, false); + xdp_features_set_redirect_target_locked(priv->dev, false); gve_set_napi_enabled(priv); } diff --git a/include/net/xdp.h b/include/net/xdp.h index 20e41b5ff319..b40f1f96cb11 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -618,7 +618,10 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/net/core/xdp.c b/net/core/xdp.c index a014df88620c..ea819764ae39 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -1014,21 +1014,38 @@ void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) } EXPORT_SYMBOL_GPL(xdp_set_features_flag); -void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); -void xdp_features_clear_redirect_target(struct net_device *dev) +void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); -- cgit v1.2.3 From 76a853f86c97b348dc96e75a6e6f94d8750715ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Mar 2025 13:49:39 +0100 Subject: wifi: free SKBTX_WIFI_STATUS skb tx_flags flag Jason mentioned at netdevconf that we've run out of tx_flags in the skb_shinfo(). Gain one bit back by removing the wifi bit. We can do that because the only userspace application for it (hostapd) doesn't change the setting on the socket, it just uses different sockets, and normally doesn't even use this any more, sending the frames over nl80211 instead. Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250313134942.52ff54a140ec.If390bbdc46904cf451256ba989d7a056c457af6e@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/txrx.h | 3 ++- drivers/net/wireless/marvell/mwifiex/main.c | 3 ++- include/linux/skbuff.h | 3 --- include/net/sock.h | 2 -- net/mac80211/mesh.c | 3 ++- net/mac80211/tx.c | 9 ++++----- 6 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/wil6210/txrx.h b/drivers/net/wireless/ath/wil6210/txrx.h index 689f68d89a44..33ccd0b248d4 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.h +++ b/drivers/net/wireless/ath/wil6210/txrx.h @@ -7,6 +7,7 @@ #ifndef WIL6210_TXRX_H #define WIL6210_TXRX_H +#include #include "wil6210.h" #include "txrx_edma.h" @@ -617,7 +618,7 @@ static inline bool wil_need_txstat(struct sk_buff *skb) const u8 *da = wil_skb_get_da(skb); return is_unicast_ether_addr(da) && skb->sk && - (skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS); + sock_flag(skb->sk, SOCK_WIFI_STATUS); } static inline void wil_consume_skb(struct sk_buff *skb, bool acked) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 0e1f53940401..40b48386cfa9 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -6,6 +6,7 @@ */ #include +#include #include "main.h" #include "wmm.h" @@ -939,7 +940,7 @@ mwifiex_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) multicast = is_multicast_ether_addr(skb->data); if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS && + sock_flag(skb->sk, SOCK_WIFI_STATUS) && priv->adapter->fw_api_ver == MWIFIEX_FW_V15)) skb = mwifiex_clone_skb_for_tx_status(priv, skb, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a..9ee39670e8f4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -481,9 +481,6 @@ enum { /* generate software time stamp on packet tx completion */ SKBTX_COMPLETION_TSTAMP = 1 << 3, - /* generate wifi status information (where possible) */ - SKBTX_WIFI_STATUS = 1 << 4, - /* determine hardware time stamp based on time or cycles */ SKBTX_HW_TSTAMP_NETDEV = 1 << 5, diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d4..36b219109790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2736,8 +2736,6 @@ static inline void _sock_tx_timestamp(struct sock *sk, *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 7257f5610af5..e2ce11ddb111 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -8,6 +8,7 @@ #include #include +#include #include "ieee80211_i.h" #include "mesh.h" #include "wme.h" @@ -776,7 +777,7 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, if (ethertype < ETH_P_802_3_MIN) return false; - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) return false; if (skb->ip_summed == CHECKSUM_PARTIAL) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 20179db88c4a..b75f72fbefd9 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "ieee80211_i.h" @@ -2876,8 +2877,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && - ((skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + ((skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); @@ -3774,7 +3774,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return false; /* don't handle TX status request here either */ - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { @@ -4664,8 +4664,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } - if (unlikely(skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { + if (unlikely(skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if (info->status_data) -- cgit v1.2.3 From 996c15bd30a9caf5d3a32414a28503f3389fc96e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 29 Mar 2025 22:14:20 +0100 Subject: wifi: cfg80211/mac80211: remove more 5/10 MHz code We still have ieee80211_chandef_rate_flags() and all that, but all the users seem pretty much broken (deflink, etc.) Remove all the code. It's been two years since last anyone even vaguely entertained the notion of looking at this and fixing it. Link: https://patch.msgid.link/20250329221419.c31da7ae8c84.I1a3a4b6008134d66ca75a5bdfc004f4594da8145@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 37 ------------------------------------- net/mac80211/ibss.c | 19 +------------------ net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 7 ++----- net/mac80211/mesh_plink.c | 10 +++------- net/mac80211/mlme.c | 4 ++-- net/mac80211/parse.c | 3 --- net/mac80211/rate.c | 10 ++-------- net/mac80211/rc80211_minstrel_ht.c | 13 +++---------- net/mac80211/tdls.c | 4 ++-- net/mac80211/tx.c | 11 ----------- net/mac80211/util.c | 25 +++++-------------------- 12 files changed, 21 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index efbd79c67be2..6df4e17f1437 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1096,43 +1096,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, **/ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); -/** - * ieee80211_chanwidth_rate_flags - return rate flags for channel width - * @width: the channel width of the channel - * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. - * - * Returns: rate flags which apply for this channel width - */ -static inline enum ieee80211_rate_flags -ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} - -/** - * ieee80211_chandef_rate_flags - returns rate flags for a channel - * @chandef: channel definition for the channel - * - * See ieee80211_chanwidth_rate_flags(). - * - * Returns: rate flags which apply for this channel - */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - return ieee80211_chanwidth_rate_flags(chandef->width); -} - /** * ieee80211_chandef_max_power - maximum transmission power for the chandef * diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 4246d168374f..a6e7b7ba6a01 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -48,7 +48,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; - u32 rate_flags, rates = 0, rates_added = 0; + u32 rates = 0, rates_added = 0; struct beacon_data *presp; int frame_len; @@ -90,14 +90,11 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, pos += ifibss->ssid_len; sband = local->hw.wiphy->bands[chandef->chan->band]; - rate_flags = ieee80211_chandef_rate_flags(chandef); rates_n = 0; if (have_higher_than_11mbit) *have_higher_than_11mbit = false; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (sband->bitrates[i].bitrate > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; @@ -395,7 +392,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies; enum nl80211_channel_type chan_type; u64 tsf; - u32 rate_flags; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -429,7 +425,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; - rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef); basic_rates = 0; @@ -439,9 +434,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, for (j = 0; j < sband->n_bitrates; j++) { int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5); if (brate == rate) { @@ -1717,12 +1709,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { u64 changed = 0; - u32 rate_flags; - struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; struct ieee80211_local *local = sdata->local; int radar_detect_width = 0; - int i; int ret; lockdep_assert_wiphy(local->hw.wiphy); @@ -1765,12 +1754,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ - rate_flags = ieee80211_chandef_rate_flags(¶ms->chandef); - sband = local->hw.wiphy->bands[params->chandef.chan->band]; - for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - sdata->u.ibss.basic_rates &= ~BIT(i); - } memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, sizeof(params->mcast_rate)); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index fb05f3cd37ec..bf8d8a4145fc 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2613,7 +2613,7 @@ void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, /* element building in SKBs */ int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id); int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index e2ce11ddb111..a381b4b756ea 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -957,13 +957,10 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); - u32 rate_flags; sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); sband = ieee80211_get_sband(sdata); - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata); @@ -1092,7 +1089,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || mesh_add_ds_params_ie(sdata, skb)) goto out_free; @@ -1105,7 +1102,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_ht_cap_ie(sdata, skb) || mesh_add_ht_oper_ie(sdata, skb) || diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 96e0a861886a..9c6a2b342170 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -264,7 +264,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; - u32 rate_flags, basic_rates; + u32 basic_rates; sband = ieee80211_get_sband(sdata); if (!sband) { @@ -280,16 +280,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, put_unaligned_le16(sta->sta.aid, pos); } - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); basic_rates = sdata->vif.bss_conf.basic_rates; if (ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5d1f2d6d09ad..784e180aab74 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1525,9 +1525,9 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local, rates = ~0; } - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 6da39c864f45..96584b39215e 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -1101,7 +1101,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { - u32 rate_flags = ieee80211_chanwidth_rate_flags(width); struct ieee80211_rate *br; int brate, rate, i, j, count = 0; @@ -1112,8 +1111,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, for (j = 0; j < sband->n_bitrates; j++) { br = &sband->bitrates[j]; - if ((rate_flags & br->flags) != rate_flags) - continue; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 0d056db9f81e..1b0667ca359e 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -368,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, u32 rate_mask) { + u32 rate_flags = 0; int i; - u32 rate_flags = - ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; @@ -778,14 +777,9 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { - u32 i, flags; + u32 i; *mask = sdata->rc_rateidx_mask[sband->band]; - flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - for (i = 0; i < sband->n_bitrates; i++) { - if ((flags & sband->bitrates[i].flags) != flags) - *mask &= ~BIT(i); - } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 706cbc99f718..f66910013218 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1873,16 +1873,13 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, - const s16 *bitrates, int n_rates, u32 rate_flags) + const s16 *bitrates, int n_rates) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; @@ -1898,7 +1895,6 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; @@ -1908,8 +1904,7 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp) BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, - ARRAY_SIZE(minstrel_cck_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_cck_bitrates)); } static void @@ -1917,7 +1912,6 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; @@ -1927,8 +1921,7 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, - ARRAY_SIZE(minstrel_ofdm_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_ofdm_bitrates)); } static void * diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 2f92e7c7f203..94714f8ffd22 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -382,8 +382,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sband)) return; - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_EXT_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_EXT_SUPP_RATES); ieee80211_tdls_add_supp_channels(sdata, skb); /* add any custom IEs that go before Extended Capabilities */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index b75f72fbefd9..e4e6f5d24884 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -50,19 +50,11 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_supported_band *sband; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_chanctx_conf *chanctx_conf; - u32 rate_flags = 0; /* assume HW handles this */ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) return 0; - rcu_read_lock(); - chanctx_conf = rcu_dereference(tx->sdata->vif.bss_conf.chanctx_conf); - if (chanctx_conf) - rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def); - rcu_read_unlock(); - /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; @@ -139,9 +131,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->bitrate > txrate->bitrate) break; - if ((rate_flags & r->flags) != rate_flags) - continue; - if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) rate = r->bitrate; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index dec6e16b8c7d..27d414efa3fd 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1204,7 +1204,6 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_supported_band *sband; int i, err; size_t noffset; - u32 rate_flags; bool have_80mhz = false; *offset = 0; @@ -1213,13 +1212,11 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, if (WARN_ON_ONCE(!sband)) return 0; - rate_flags = ieee80211_chandef_rate_flags(chandef); - /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; @@ -1241,7 +1238,7 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, *offset = noffset; } - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; @@ -1522,16 +1519,13 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, { struct ieee80211_supported_band *sband; size_t num_rates; - u32 supp_rates, rate_flags; + u32 supp_rates; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + @@ -1551,12 +1545,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, continue; for (j = 0; j < num_rates; j++) { - int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; - - brate = sband->bitrates[j].bitrate; + int brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); @@ -3223,15 +3212,13 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; rates++; @@ -3257,8 +3244,6 @@ int ieee80211_put_srates_elem(struct sk_buff *skb, int rate; u8 basic; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; -- cgit v1.2.3 From 4876376988081d636a4c4e5f03a5556386b49087 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 3 Apr 2025 20:39:28 +0200 Subject: Revert "mac80211: Dynamically set CoDel parameters per station" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 484a54c2e597dbc4ace79c1687022282905afba0. The CoDel parameter change essentially disables CoDel on slow stations, with some questionable assumptions, as Dave pointed out in [0]. Quoting from there: But here are my pithy comments as to why this part of mac80211 is so wrong... static void sta_update_codel_params(struct sta_info *sta, u32 thr) { - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { 1) sta->local->num_sta is the number of associated, rather than active, stations. "Active" stations in the last 50ms or so, might have been a better thing to use, but as most people have far more than that associated, we end up with really lousy codel parameters, all the time. Mistake numero uno! 2) The STA_SLOW_THRESHOLD was completely arbitrary in 2016. - sta->cparams.target = MS2TIME(50); This, by itself, was probably not too bad. 30ms might have been better, at the time, when we were battling powersave etc, but 20ms was enough, really, to cover most scenarios, even where we had low rate 2Ghz multicast to cope with. Even then, codel has a hard time finding any sane drop rate at all, with a target this high. - sta->cparams.interval = MS2TIME(300); But this was horrible, a total mistake, that is leading to codel being completely ineffective in almost any scenario on clients or APS. 100ms, even 80ms, here, would be vastly better than this insanity. I'm seeing 5+seconds of delay accumulated in a bunch of otherwise happily fq-ing APs.... 100ms of observed jitter during a flow is enough. Certainly (in 2016) there were interactions with powersave that I did not understand, and still don't, but if you are transmitting in the first place, powersave shouldn't be a problemmmm..... - sta->cparams.ecn = false; At the time we were pretty nervous about ecn, I'm kind of sanguine about it now, and reliably indicating ecn seems better than turning it off for any reason. [...] In production, on p2p wireless, I've had 8ms and 80ms for target and interval for years now, and it works great. I think Dave's arguments above are basically sound on the face of it, and various experimentation with tighter CoDel parameters in the OpenWrt community have show promising results[1]. So I don't think there's any reason to keep this parameter fiddling; hence this revert. [0] https://lore.kernel.org/linux-wireless/CAA93jw6NJ2cmLmMauz0xAgC2MGbBq6n0ZiZzAdkK0u4b+O2yXg@mail.gmail.com/ [1] https://forum.openwrt.org/t/reducing-multiplexing-latencies-still-further-in-wifi/133605/130 Suggested-By: Dave Taht In-memory-of: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250403183930.197716-1-toke@toke.dk Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ---------------- net/mac80211/debugfs_sta.c | 6 ------ net/mac80211/rate.c | 2 -- net/mac80211/sta_info.c | 28 ---------------------------- net/mac80211/sta_info.h | 11 ----------- net/mac80211/tx.c | 9 +-------- 6 files changed, 1 insertion(+), 71 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c498f685d01f..5349df596157 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5346,22 +5346,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_tx_rate *dest, int max_rates); -/** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station. A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - /** * ieee80211_tx_rate_update - transmit rate update callback * diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a8948f4d983e..49061bd4151b 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -150,12 +150,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, spin_lock_bh(&local->fq.lock); rcu_read_lock(); - p += scnprintf(p, - bufsz + buf - p, - "target %uus interval %uus ecn %s\n", - codel_time_to_us(sta->cparams.target), - codel_time_to_us(sta->cparams.interval), - sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 1b0667ca359e..3cb2ad6d0b28 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -984,8 +984,6 @@ int rate_control_set_rates(struct ieee80211_hw *hw, if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); - ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); - return 0; } EXPORT_SYMBOL(rate_control_set_rates); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 248e1f63bf73..84b18be1f0b1 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -18,7 +18,6 @@ #include #include -#include #include #include "ieee80211_i.h" #include "driver-ops.h" @@ -701,12 +700,6 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, } } - sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - sta->cparams.ce_threshold_selector = 0; - sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); @@ -2905,27 +2898,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) return sta->deflink.status_stats.last_ack; } -static void sta_update_codel_params(struct sta_info *sta, u32 thr) -{ - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { - sta->cparams.target = MS2TIME(50); - sta->cparams.interval = MS2TIME(300); - sta->cparams.ecn = false; - } else { - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - } -} - -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr) -{ - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - - sta_update_codel_params(sta, thr); -} - int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 07b7ec39a52f..7a95d8d34fca 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -466,14 +466,6 @@ struct ieee80211_fragment_cache { unsigned int next; }; -/* - * The bandwidth threshold below which the per-station CoDel parameters will be - * scaled to be more lenient (to prevent starvation of slow stations). This - * value will be scaled by the number of active stations when it is being - * applied. - */ -#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */ - /** * struct link_sta_info - Link STA information * All link specific sta info are stored here for reference. This can be @@ -626,7 +618,6 @@ struct link_sta_info { * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @rcu_head: RCU head used for freeing this station struct - * @cparams: CoDel parameters for this station. * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer: * @@ -717,8 +708,6 @@ struct sta_info { struct dentry *debugfs_dir; #endif - struct codel_params cparams; - u8 reserved_tid; s8 amsdu_mesh_control; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e4e6f5d24884..d2e4cc117903 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1392,16 +1392,9 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; cstats = &txqi->cstats; - if (txqi->txq.sta) { - struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); - cparams = &sta->cparams; - } else { - cparams = &local->cparams; - } - if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else -- cgit v1.2.3 From fcc2d3e11bcc8f01d52a8c419f49f86ff8343b7c Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Mon, 21 Apr 2025 16:45:05 +0530 Subject: wifi: ieee80211: define beacon protection bit field An AP supporting Beacon Protection should set bit 84 in the extended capabilities IE (9.4.2.25 in the 802.11be D7 spec). So the *4th* bit of the 10th byte should be checked to figure out whether beacon protection is enabled or disabled. Signed-off-by: Karthikeyan Kathirvel Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20250421111505.3633992-1-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 508d466de1cc..cbc3928aa504 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4087,6 +4087,9 @@ enum ieee80211_tdls_actioncode { /* Defines support for enhanced multi-bssid advertisement*/ #define WLAN_EXT_CAPA11_EMA_SUPPORT BIT(3) +/* Enable Beacon Protection */ +#define WLAN_EXT_CAPA11_BCN_PROTECT BIT(4) + /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 -- cgit v1.2.3 From 53160d0edf7336acaed4c74c6af8549d87c92ae6 Mon Sep 17 00:00:00 2001 From: Ramasamy Kaliappan Date: Thu, 27 Mar 2025 10:43:17 +0530 Subject: wifi: cfg80211: Add support to get EMLSR capabilities of non-AP MLD The Enhanced multi-link single-radio (EMLSR) operation allows a non-AP MLD with multiple receive chains to listen on one or more EMLSR links when the corresponding non-AP STA(s) affiliated with the non-AP MLD is (are) in the awake state. [IEEE 802.11be-2024, (35.3.17 Enhanced multi-link single-radio (EMLSR) operation)] An MLD which intends to enable EMLSR operations will set the EML Capabilities Present subfield to 1 and shall set the EMLSR Support subfield in the Common Info field of the Basic Multi-Link element to 1 in all Management frames that include the Basic Multi-Link element except Authentication frames. EML capabilities contains information such as EML Transition timeout, Padding delay and Transition delay. These fields needs to updated to drivers to trigger EMLSR operation and to transmit and receive initial control frame and data frames. Add support to receive EML Capabilities subfield that non-AP MLD advertises during (re)association request and send it to underlying drivers during ADD/SET station. Signed-off-by: Ramasamy Kaliappan Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-2-quic_ramess@quicinc.com [accept EMLSR capabilities only for unassoc AP STA] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ net/wireless/nl80211.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6df4e17f1437..87cb66fba621 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1733,6 +1733,9 @@ struct cfg80211_ttlm_params { * @supported_oper_classes_len: number of supported operating classes * @support_p2p_ps: information if station supports P2P PS mechanism * @airtime_weight: airtime scheduler weight for this station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. */ struct station_parameters { @@ -1757,6 +1760,8 @@ struct station_parameters { u8 supported_oper_classes_len; int support_p2p_ps; u16 airtime_weight; + bool eml_cap_present; + u16 eml_cap; struct link_station_parameters link_sta_params; }; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f039a7d0d6f7..de6c936039f4 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -833,6 +833,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, + [NL80211_ATTR_EML_CAPABILITY] = { .type = NLA_U16 }, [NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range), @@ -7118,6 +7119,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; } + /* Accept EMLSR capabilities only for AP client before association */ + if (statype != CFG80211_STA_AP_CLIENT_UNASSOC && + params->eml_cap_present) + return -EINVAL; + switch (statype) { case CFG80211_STA_AP_MLME_CLIENT: /* Use this only for authorizing/unauthorizing a station */ @@ -7473,6 +7479,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { + params.eml_cap_present = true; + params.eml_cap = + nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); @@ -7631,6 +7643,12 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { + params.eml_cap_present = true; + params.eml_cap = + nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); -- cgit v1.2.3 From 14e0f59a88cc22ceeb36e26b89b70b22292d23de Mon Sep 17 00:00:00 2001 From: Ramasamy Kaliappan Date: Thu, 27 Mar 2025 10:43:18 +0530 Subject: wifi: mac80211: update ML STA with EML capabilities When an AP and Non-AP MLD operates in EMLSR mode, EML capabilities advertised during Association contains information such as EMLSR transition delay, padding delay and transition timeout values. Save the EML capabilities information that is received during station addition and capabilities update in ieee80211_sta so that drivers can use it for triggering EMLSR operation. Signed-off-by: Ramasamy Kaliappan Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-3-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/cfg.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5349df596157..c305ebfa6e45 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2488,6 +2488,7 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. @@ -2522,6 +2523,7 @@ struct ieee80211_sta { bool mlo; bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; struct ieee80211_sta_aggregates *cur; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9f683f838431..2676b368e5a6 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2066,6 +2066,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; + if (params->eml_cap_present) + sta->sta.eml_cap = params->eml_cap; + ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY, ¶ms->link_sta_params); if (ret) -- cgit v1.2.3 From 91ea0489dc97bfda72ed74f98ab66dc0ab4235c7 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Thu, 27 Mar 2025 10:43:19 +0530 Subject: wifi: ieee80211: Add helpers to fetch EMLSR delay and timeout values Add helpers to get EMLSR transition delay, padding delay and transition timeout values from EML capabilities field of Multi-link Element. Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250327051320.3253783-4-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index cbc3928aa504..15a87f522017 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5618,6 +5618,80 @@ static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) return len >= fixed + elem_len; } +/** + * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay + * in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Padding delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR + * Padding Delay subfield. + */ + u32 pad_delay = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + + if (!pad_delay || + pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return 0; + + return 32 * (1 << (pad_delay - 1)); +} + +/** + * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition + * delay in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR + * Transition Delay subfield. + */ + u32 trans_delay = + u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + + /* invalid values also just use 0 */ + if (!trans_delay || + trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return 0; + + return 16 * (1 << (trans_delay - 1)); +} + +/** + * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition + * timeout value in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition timeout (in microseconds) encoded in + * the EML Capabilities field + */ + +static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the + * Transition Timeout subfield. + */ + u8 timeout = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_TRANSITION_TIMEOUT); + + /* invalid values also just use 0 */ + if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) + return 0; + + return 128 * (1 << (timeout - 1)); +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit v1.2.3 From 37523c3c47b3f3cc4c7d2ff47d28ee9ec99317c1 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 8 Apr 2025 11:44:59 -0700 Subject: wifi: nl80211: add link id of transmitted profile for MLO MBSSID During non-transmitted (nontx) profile configuration, interface index of the transmitted (tx) profile is used to retrieve the wireless device (wdev) associated with it. With MLO, this 'wdev' may be part of an MLD with more than one link, hence only interface index is not sufficient anymore to retrieve the correct tx profile. Add a new attribute to configure link id of tx profile. Signed-off-by: Rameshkumar Sundaram Co-developed-by: Muna Sinada Signed-off-by: Muna Sinada Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Link: https://patch.msgid.link/20250408184501.3715887-2-aloka.dixit@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 24 +++++++++++++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 87cb66fba621..d1848dc8ec99 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1263,11 +1263,13 @@ struct cfg80211_crypto_settings { * struct cfg80211_mbssid_config - AP settings for multi bssid * * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. * @index: index of this AP in the multi bssid group. * @ema: set to true if the beacons should be sent out in EMA mode. */ struct cfg80211_mbssid_config { struct wireless_dev *tx_wdev; + u8 tx_link_id; u8 index; bool ema; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ddcc4cda74af..e9ccf43fe3c6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8036,6 +8036,11 @@ enum nl80211_sar_specs_attrs { * Setting this flag is permitted only if the driver advertises EMA support * by setting wiphy->ema_max_profile_periodicity to non-zero. * + * @NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID: Link ID of the transmitted profile. + * This parameter is mandatory when NL80211_ATTR_MBSSID_CONFIG attributes + * are sent for a non-transmitted profile and if the transmitted profile + * is part of an MLD. For all other cases this parameter is unnecessary. + * * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute */ @@ -8047,6 +8052,7 @@ enum nl80211_mbssid_config_attributes { NL80211_MBSSID_CONFIG_ATTR_INDEX, NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX, NL80211_MBSSID_CONFIG_ATTR_EMA, + NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID, /* keep last */ __NL80211_MBSSID_CONFIG_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de6c936039f4..fd5f79266471 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -469,6 +469,8 @@ nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = { [NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 }, [NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 }, [NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG }, + [NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID] = + NLA_POLICY_MAX(NLA_U8, IEEE80211_MLD_MAX_NUM_LINKS), }; static const struct nla_policy @@ -5524,11 +5526,13 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, static int nl80211_parse_mbssid_config(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct nlattr *attrs, struct cfg80211_mbssid_config *config, u8 num_elems) { struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1]; + int tx_link_id = -1; if (!wiphy->mbssid_max_interfaces) return -EOPNOTSUPP; @@ -5552,6 +5556,9 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy, (!config->index && !num_elems)) return -EINVAL; + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]) + tx_link_id = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]); + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) { u32 tx_ifindex = nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]); @@ -5573,10 +5580,25 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy, } config->tx_wdev = tx_netdev->ieee80211_ptr; + /* Caller should call dev_put(config->tx_wdev) from this point */ + + if (config->tx_wdev->valid_links) { + if (tx_link_id == -1 || + !(config->tx_wdev->valid_links & BIT(tx_link_id))) + return -ENOLINK; + + config->tx_link_id = tx_link_id; + } } else { + if (tx_link_id >= 0 && tx_link_id != link_id) + return -EINVAL; + config->tx_wdev = dev->ieee80211_ptr; } } else if (!config->index) { + if (tx_link_id >= 0 && tx_link_id != link_id) + return -EINVAL; + config->tx_wdev = dev->ieee80211_ptr; } else { return -EINVAL; @@ -6326,7 +6348,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) { - err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, + err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, link_id, info->attrs[NL80211_ATTR_MBSSID_CONFIG], ¶ms->mbssid_config, params->beacon.mbssid_ies ? -- cgit v1.2.3 From f600832794c91d7021d7337104734246b02a2b86 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Tue, 8 Apr 2025 11:45:00 -0700 Subject: wifi: mac80211: restructure tx profile retrieval for MLO MBSSID For MBSSID, each vif (struct ieee80211_vif) stores another vif pointer for the transmitting profile of MBSSID set. This won't suffice for MLO as there may be multiple links, each of which can be part of different MBSSID sets. Hence the information needs to be stored per-link. Additionally, the transmitted profile itself may be part of an MLD hence storing vif will not suffice either. Fix MLO by storing an instance of struct ieee80211_bss_conf for each link. Modify following operations to reflect the above structure updates: - channel switch completion - BSS color change completion - Removing nontransmitted links in ieee80211_stop_mbssid() - drivers retrieving the transmitted link for beacon templates. Signed-off-by: Rameshkumar Sundaram Co-developed-by: Muna Sinada Signed-off-by: Muna Sinada Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Link: https://patch.msgid.link/20250408184501.3715887-3-aloka.dixit@oss.qualcomm.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 10 +++- drivers/net/wireless/ath/ath12k/mac.c | 22 ++++++-- drivers/net/wireless/virtual/mac80211_hwsim.c | 7 +-- include/net/mac80211.h | 7 +-- net/mac80211/cfg.c | 64 ++++++++++++++--------- net/mac80211/ieee80211_i.h | 2 + net/mac80211/iface.c | 75 +++++++++++++++++++-------- 7 files changed, 130 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 97816916abac..8191ee14a1fd 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1531,8 +1531,14 @@ static int ath11k_mac_set_vif_params(struct ath11k_vif *arvif, static struct ath11k_vif *ath11k_mac_get_tx_arvif(struct ath11k_vif *arvif) { - if (arvif->vif->mbssid_tx_vif) - return ath11k_vif_to_arvif(arvif->vif->mbssid_tx_vif); + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; + + lockdep_assert_wiphy(arvif->ar->hw->wiphy); + + link_conf = &arvif->vif->bss_conf; + tx_bss_conf = wiphy_dereference(arvif->ar->hw->wiphy, link_conf->tx_bss_conf); + if (tx_bss_conf) + return ath11k_vif_to_arvif(tx_bss_conf->vif); return NULL; } diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index dfa05f0ee6c9..84da77bf245b 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -582,12 +582,26 @@ static int ath12k_mac_vif_link_chan(struct ieee80211_vif *vif, u8 link_id, static struct ath12k_link_vif *ath12k_mac_get_tx_arvif(struct ath12k_link_vif *arvif) { + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; + struct ath12k *ar = arvif->ar; struct ath12k_vif *tx_ahvif; - if (arvif->ahvif->vif->mbssid_tx_vif) { - tx_ahvif = ath12k_vif_to_ahvif(arvif->ahvif->vif->mbssid_tx_vif); - if (tx_ahvif) - return &tx_ahvif->deflink; + lockdep_assert_wiphy(ath12k_ar_to_hw(ar)->wiphy); + + link_conf = ath12k_mac_get_link_bss_conf(arvif); + if (!link_conf) { + ath12k_warn(ar->ab, + "unable to access bss link conf for link %u required to retrieve transmitting link conf\n", + arvif->link_id); + return NULL; + } + + tx_bss_conf = wiphy_dereference(ath12k_ar_to_hw(ar)->wiphy, + link_conf->tx_bss_conf); + if (tx_bss_conf) { + tx_ahvif = ath12k_vif_to_ahvif(tx_bss_conf->vif); + return wiphy_dereference(tx_ahvif->ah->hw->wiphy, + tx_ahvif->link[tx_bss_conf->link_id]); } return NULL; diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index cf3e976471c6..74d037cfccca 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2273,7 +2273,7 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac, { struct mac80211_hwsim_link_data *link_data = arg; u32 link_id = link_data->link_id; - struct ieee80211_bss_conf *link_conf; + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; struct mac80211_hwsim_data *data = container_of(link_data, struct mac80211_hwsim_data, link_data[link_id]); @@ -2292,10 +2292,11 @@ static void mac80211_hwsim_beacon_tx(void *arg, u8 *mac, vif->type != NL80211_IFTYPE_OCB) return; - if (vif->mbssid_tx_vif && vif->mbssid_tx_vif != vif) + tx_bss_conf = rcu_access_pointer(link_conf->tx_bss_conf); + if (tx_bss_conf && tx_bss_conf != link_conf) return; - if (vif->bss_conf.ema_ap) { + if (link_conf->ema_ap) { struct ieee80211_ema_beacons *ema; u8 i = 0; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c305ebfa6e45..fdafc37d17cc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -682,6 +682,9 @@ struct ieee80211_parsed_tpe { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -804,6 +807,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -2023,7 +2027,6 @@ enum ieee80211_neg_ttlm_res { * @txq: the multicast data TX queue * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. - * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; @@ -2052,8 +2055,6 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - struct ieee80211_vif *mbssid_tx_vif; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2676b368e5a6..9017b98fea04 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -146,8 +146,8 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *old; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; @@ -156,14 +156,26 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; + old = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (old) + return -EALREADY; + tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { - sdata->vif.mbssid_tx_vif = &sdata->vif; + rcu_assign_pointer(link_conf->tx_bss_conf, link_conf); } else { - sdata->vif.mbssid_tx_vif = &tx_sdata->vif; + struct ieee80211_bss_conf *tx_bss_conf; + + tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id], + sdata); + if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf) + return -EINVAL; + + rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf); + link_conf->nontransmitted = true; link_conf->bssid_index = params->index; } @@ -1669,7 +1681,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; @@ -1683,6 +1694,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_free_key_list(local, &keys); } + ieee80211_stop_mbssid(sdata); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); + link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; @@ -3703,6 +3717,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *tx_bss_conf; struct ieee80211_link_data *link_data; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) @@ -3716,25 +3731,24 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) return; } - /* TODO: MBSSID with MLO changes */ - if (vif->mbssid_tx_vif == vif) { + tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf); + if (tx_bss_conf == link_data->conf) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ - struct ieee80211_sub_if_data *iter; - - list_for_each_entry_rcu(iter, &local->interfaces, list) { - if (!ieee80211_sdata_running(iter)) - continue; + struct ieee80211_link_data *iter; - if (iter == sdata || iter->vif.mbssid_tx_vif != vif) + for_each_sdata_link(local, iter) { + if (iter->sdata == sdata || + rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf) continue; - wiphy_work_queue(iter->local->hw.wiphy, - &iter->deflink.csa.finalize_work); + wiphy_work_queue(iter->sdata->local->hw.wiphy, + &iter->csa.finalize_work); } } + wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); @@ -4836,17 +4850,19 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, ieee80211_link_info_change_notify(sdata, link, changed); - if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { - struct ieee80211_sub_if_data *child; + if (!link->conf->nontransmitted && + rcu_access_pointer(link->conf->tx_bss_conf)) { + struct ieee80211_link_data *tmp; - list_for_each_entry(child, &sdata->local->interfaces, list) { - if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { - child->vif.bss_conf.he_bss_color.color = color; - child->vif.bss_conf.he_bss_color.enabled = enable; - ieee80211_link_info_change_notify(child, - &child->deflink, - BSS_CHANGED_HE_BSS_COLOR); - } + for_each_sdata_link(sdata->local, tmp) { + if (tmp->sdata == sdata || + rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf) + continue; + + tmp->conf->he_bss_color.color = color; + tmp->conf->he_bss_color.enabled = enable; + ieee80211_link_info_change_notify(tmp->sdata, tmp, + BSS_CHANGED_HE_BSS_COLOR); } } } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 99a31017fa59..30809f0b35f7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2807,6 +2807,8 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); + #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7d93e5aa595b..7c27f3cd841c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -732,30 +732,59 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_add_virtual_monitor(local); } -static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata; - struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif; + struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; + struct ieee80211_link_data *tx_link, *link; + unsigned int link_id; - if (!tx_vif) - return; + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + /* Check if any of the links of current sdata is an MBSSID. */ + for_each_vif_active_link(&sdata->vif, link_conf, link_id) { + tx_bss_conf = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (!tx_bss_conf) + continue; - tx_sdata = vif_to_sdata(tx_vif); - sdata->vif.mbssid_tx_vif = NULL; + tx_sdata = vif_to_sdata(tx_bss_conf->vif); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); - list_for_each_entry_safe(non_tx_sdata, tmp_sdata, - &tx_sdata->local->interfaces, list) { - if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata && - non_tx_sdata->vif.mbssid_tx_vif == tx_vif && - ieee80211_sdata_running(non_tx_sdata)) { - non_tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(non_tx_sdata->wdev.netdev); + /* If we are not tx sdata reset tx sdata's tx_bss_conf to avoid recusrion + * while closing tx sdata at the end of outer loop below. + */ + if (sdata != tx_sdata) { + tx_link = sdata_dereference(tx_sdata->link[tx_bss_conf->link_id], + tx_sdata); + if (!tx_link) + continue; + + RCU_INIT_POINTER(tx_link->conf->tx_bss_conf, NULL); + } + + /* loop through sdatas to find if any of their links + * belong to same MBSSID set as the one getting deleted. + */ + for_each_sdata_link(tx_sdata->local, link) { + struct ieee80211_sub_if_data *link_sdata = link->sdata; + + if (link_sdata == sdata || link_sdata == tx_sdata || + rcu_access_pointer(link->conf->tx_bss_conf) != tx_bss_conf) + continue; + + RCU_INIT_POINTER(link->conf->tx_bss_conf, NULL); + + /* Remove all links of matching MLD until dynamic link + * removal can be supported. + */ + cfg80211_stop_iface(link_sdata->wdev.wiphy, &link_sdata->wdev, + GFP_KERNEL); } - } - if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) { - tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(tx_sdata->wdev.netdev); + /* If we are not tx sdata, remove links of tx sdata and proceed */ + if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) + cfg80211_stop_iface(tx_sdata->wdev.wiphy, + &tx_sdata->wdev, GFP_KERNEL); } } @@ -763,21 +792,25 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - /* close dependent VLAN and MBSSID interfaces before locking wiphy */ + /* close dependent VLAN interfaces before locking wiphy */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); - - ieee80211_stop_mbssid(sdata); } guard(wiphy)(sdata->local->hw.wiphy); wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work); + /* Close the dependent MBSSID interfaces with wiphy lock as we may be + * terminating its partner links too in case of MLD. + */ + if (sdata->vif.type == NL80211_IFTYPE_AP) + ieee80211_stop_mbssid(sdata); + ieee80211_do_stop(sdata, true); return 0; -- cgit v1.2.3 From 52358dd63e348c3b6c488acc105be1aeda8fb923 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:04:01 +0200 Subject: net: phy: remove function stubs All callers of these functions depend on PHYLIB or select it directly or indirectly by selecting PHYLINK. Stubs make sense for optional functionality, but that's not the case here. MDIO_XGENE usually is selected by NET_XGENE which also selects PHYLIB. Add a dependency to PHYLIB nevertheless, in order not to break randconfig builds. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/f7a69a1f-60e9-4ac0-8b7c-481e0cc850e7@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/Kconfig | 1 + include/linux/phy.h | 37 ------------------------------------- 2 files changed, 1 insertion(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index 058fcdaf6c18..38a4901da32f 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -57,6 +57,7 @@ config MDIO_SUN4I config MDIO_XGENE tristate "APM X-Gene SoC MDIO bus controller" depends on ARCH_XGENE || COMPILE_TEST + depends on PHYLIB help This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. diff --git a/include/linux/phy.h b/include/linux/phy.h index 066a28a4b64b..3beaf225ee88 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1753,7 +1753,6 @@ int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -#if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); @@ -1761,42 +1760,6 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); -#else -static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) -{ - return 0; -} -static inline -struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) -{ - return 0; -} - -static inline -struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) -{ - return NULL; -} - -static inline -struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) -{ - return NULL; -} - -static inline -struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) -{ - return NULL; -} - -static inline int phy_device_register(struct phy_device *phy) -{ - return 0; -} - -static inline void phy_device_free(struct phy_device *phydev) { } -#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); -- cgit v1.2.3 From 834d97843e3bca86f17cc517885f54f3433427b2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:53 -0700 Subject: ipv6: Protect fib6_link_table() with spinlock. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. If the request specifies a new table ID, fib6_new_table() is called to create a new routing table. Two concurrent requests could specify the same table ID, so we need a lock to protect net->ipv6.fib_table_hash[h]. Let's add a spinlock to protect the hash bucket linkage. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-13-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_fib.c | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570a..47dc70d8100a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,6 +72,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bf727149fdec..79b672f3fc53 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -249,19 +249,33 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); + + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } - return tb; + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); @@ -2423,6 +2437,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) -- cgit v1.2.3 From accb46b56bc3bc99ee69ba18b06ca60266ad6fca Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:54 -0700 Subject: ipv6: Defer fib6_purge_rt() in fib6_add_rt2node() to fib6_add(). The next patch adds per-nexthop spinlock which protects nh->f6i_list. When rt->nh is not NULL, fib6_add_rt2node() will be called under the lock. fib6_add_rt2node() could call fib6_purge_rt() for another route, which could holds another nexthop lock. Then, deadlock could happen between two nexthops. Let's defer fib6_purge_rt() after fib6_add_rt2node(). Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-14-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip6_fib.h | 1 + net/ipv6/ip6_fib.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c87873ae211..88b0dd4d8e09 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -198,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 79b672f3fc53..9e9db5470bbf 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1083,8 +1083,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1308,10 +1308,9 @@ add: } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1324,10 +1323,9 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1397,6 +1395,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; @@ -1499,8 +1498,16 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack); + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); -- cgit v1.2.3 From 081efd18326e353c6fbfdeff903a83edde953f72 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:55 -0700 Subject: ipv6: Protect nh->f6i_list with spinlock and flag. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. Then, we may be going to add a route tied to a dying nexthop. The nexthop itself is not freed during the RCU grace period, but if we link a route after __remove_nexthop_fib() is called for the nexthop, the route will be leaked. To avoid the race between IPv6 route addition under RCU vs nexthop deletion under RTNL, let's add a dead flag and protect it and nh->f6i_list with a spinlock. __remove_nexthop_fib() acquires the nexthop's spinlock and sets false to nh->dead, then calls ip6_del_rt() for the linked route one by one without the spinlock because fib6_purge_rt() acquires it later. While adding an IPv6 route, fib6_add() acquires the nexthop lock and checks the dead flag just before inserting the route. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-15-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/nexthop.h | 2 ++ net/ipv4/nexthop.c | 18 +++++++++++++++--- net/ipv6/ip6_fib.c | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d9fb44e8b321..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -152,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index d9cf06b297d1..6ba6cb1340c1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); + spin_lock_init(&nh->lock); } return nh; } @@ -2118,7 +2119,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { - struct fib6_info *f6i, *tmp; + struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; @@ -2129,13 +2130,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) if (do_flush) fib_flush(net); - /* ip6_del_rt removes the entry from this list hence the _safe */ - list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + spin_lock_bh(&nh->lock); + + nh->dead = true; + + while (!list_empty(&nh->f6i_list)) { + f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); + /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); + + spin_unlock_bh(&nh->lock); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); + + spin_lock_bh(&nh->lock); } + + spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9e9db5470bbf..1f860340690c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1048,8 +1048,14 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1341,6 +1347,28 @@ add: return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1498,7 +1526,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; @@ -1508,8 +1539,6 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, fib6_info_release(iter); } - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) -- cgit v1.2.3 From 39144062ea335495d659b08c9e3133ab746a0b1b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 23 Apr 2025 00:51:47 +0100 Subject: rxrpc: Remove deadcode Remove three functions that are no longer used. rxrpc_get_txbuf() last use was removed by 2020's commit 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") rxrpc_kernel_get_epoch() last use was removed by 2020's commit 44746355ccb1 ("afs: Don't get epoch from a server because it may be ambiguous") rxrpc_kernel_set_max_life() last use was removed by 2023's commit db099c625b13 ("rxrpc: Fix timeout of a call that hasn't yet been granted a channel") Both of the rxrpc_kernel_* functions were documented. Remove that documentation as well as the code. Signed-off-by: Dr. David Alan Gilbert Acked-by: David Howells Link: https://patch.msgid.link/20250422235147.146460-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 24 ---------------------- include/net/af_rxrpc.h | 3 --- net/rxrpc/af_rxrpc.c | 41 -------------------------------------- net/rxrpc/ar-internal.h | 1 - net/rxrpc/txbuf.c | 8 -------- 5 files changed, 77 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index fe2ea73be441..d63e3e27dd06 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1062,30 +1062,6 @@ The kernel interface functions are as follows: first function to change. Note that this must be called in TASK_RUNNING state. - (#) Get remote client epoch:: - - u32 rxrpc_kernel_get_epoch(struct socket *sock, - struct rxrpc_call *call) - - This allows the epoch that's contained in packets of an incoming client - call to be queried. This value is returned. The function always - successful if the call is still in progress. It shouldn't be called once - the call has expired. Note that calling this on a local client call only - returns the local epoch. - - This value can be used to determine if the remote client has been - restarted as it shouldn't change otherwise. - - (#) Set the maximum lifespan on a call:: - - void rxrpc_kernel_set_max_life(struct socket *sock, - struct rxrpc_call *call, - unsigned long hard_timeout) - - This sets the maximum lifespan on a call to hard_timeout (which is in - jiffies). In the event of the timeout occurring, the call will be - aborted and -ETIME or -ETIMEDOUT will be returned. - (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within in the kernel:: diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f15341594cc8..0fb4c41c9bbf 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -88,9 +88,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3a558c1a541e..36df0274d7b7 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -460,22 +460,6 @@ bool rxrpc_kernel_check_life(const struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_check_life); -/** - * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. - * @sock: The socket the call is on - * @call: The call to query - * - * Allow a kernel service to retrieve the epoch value from a service call to - * see if the client at the other end rebooted. - * - * Return: The epoch of the call's connection. - */ -u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) -{ - return call->conn->proto.epoch; -} -EXPORT_SYMBOL(rxrpc_kernel_get_epoch); - /** * rxrpc_kernel_set_notifications - Set table of callback operations * @sock: The socket to install table upon @@ -492,31 +476,6 @@ void rxrpc_kernel_set_notifications(struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_set_notifications); -/** - * rxrpc_kernel_set_max_life - Set maximum lifespan on a call - * @sock: The socket the call is on - * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in ms - * - * Set the maximum lifespan of a call. The call will end with ETIME or - * ETIMEDOUT if it takes longer than this. - */ -void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, - unsigned long hard_timeout) -{ - ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; - - mutex_lock(&call->user_mutex); - - expect_term_by = ktime_add(ktime_get_real(), delay); - WRITE_ONCE(call->expect_term_by, expect_term_by); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); - rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); - - mutex_unlock(&call->user_mutex); -} -EXPORT_SYMBOL(rxrpc_kernel_set_max_life); - /* * connect an RxRPC socket * - this just targets it at a specific destination; no actual connection diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ca62a1db3286..5bd3922c310d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1503,7 +1503,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c550991d48fa..29767038691a 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -60,14 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) -{ - int r; - - __refcount_inc(&txb->ref, &r); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); -} - void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); -- cgit v1.2.3 From bc2550b4e195754fbb24aac1f012d3dd9e3b4edc Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:33 +0100 Subject: tcp: fastopen: note that a child socket was created tcp: fastopen: note that a child socket was created This uses up the last bit in a field of tcp_sock. Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-2-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1669d95bb0f9..a8af71623ba7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -385,7 +385,8 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0e96f8fd47c..df3eb0fb7cb3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; + tp->syn_fastopen_child = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 1a6b1bc54245..9b83d639b5ac 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -401,6 +401,7 @@ fastopen: } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + tcp_sk(child)->syn_fastopen_child = 1; return child; } NET_INC_STATS(sock_net(sk), -- cgit v1.2.3 From 2b13042d3636327eb50c8a0ee06f629d52d1b8fb Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:34 +0100 Subject: tcp: fastopen: pass TFO child indication through getsockopt tcp: fastopen: pass TFO child indication through getsockopt Note that this uses up the last bit of a field in struct tcp_info Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-3-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b..bdac8c42fa82 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ +#define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index df3eb0fb7cb3..86c427f16636 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4165,6 +4165,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; + if (tp->syn_fastopen_child) + info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, -- cgit v1.2.3 From d57ee99831e336576359beb26e2b140511c99106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Apr 2025 17:08:08 +0200 Subject: net: ethernet: mtk_wed: annotate RCU release in attach() There are some sparse warnings in wifi, and it seems that it's actually possible to annotate a function pointer with __releases(), making the sparse warnings go away. In a way that also serves as documentation that rcu_read_unlock() must be called in the attach method, so add that annotation. Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250423150811.456205-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index a476648858a6..d8949a4ed0dc 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -192,7 +192,7 @@ struct mtk_wed_device { }; struct mtk_wed_ops { - int (*attach)(struct mtk_wed_device *dev); + int (*attach)(struct mtk_wed_device *dev) __releases(RCU); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, -- cgit v1.2.3 From 34dd0fecaa02d654c447d43a7e4c72f9b18b7033 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Apr 2025 16:55:31 +0200 Subject: net: sched: generalize check for no-queue qdisc on TX queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "noqueue" qdisc can either be directly attached, or get default attached if net_device priv_flags has IFF_NO_QUEUE. In both cases, the allocated Qdisc structure gets it's enqueue function pointer reset to NULL by noqueue_init() via noqueue_qdisc_ops. This is a common case for software virtual net_devices. For these devices with no-queue, the transmission path in __dev_queue_xmit() will bypass the qdisc layer. Directly invoking device drivers ndo_start_xmit (via dev_hard_start_xmit). In this mode the device driver is not allowed to ask for packets to be queued (either via returning NETDEV_TX_BUSY or stopping the TXQ). The simplest and most reliable way to identify this no-queue case is by checking if enqueue == NULL. The vrf driver currently open-codes this check (!qdisc->enqueue). While functionally correct, this low-level detail is better encapsulated in a dedicated helper for clarity and long-term maintainability. To make this behavior more explicit and reusable, this patch introduce a new helper: qdisc_txq_has_no_queue(). Helper will also be used by the veth driver in the next patch, which introduces optional qdisc-based backpressure. This is a non-functional change. Reviewed-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/174559293172.827981.7583862632045264175.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 4 +--- include/net/sch_generic.h | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 7168b33adadb..9a4beea6ee0c 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -343,15 +343,13 @@ unlock: static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; - struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); - qdisc = rcu_access_pointer(txq->qdisc); - return !qdisc->enqueue; + return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd..b6c177f7141c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { -- cgit v1.2.3 From 0014af802193aa3547484b5db0f1a258bad28c81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Apr 2025 15:55:53 +0200 Subject: netfilter: nf_tables: export set count and backend name to userspace nf_tables picks a suitable set backend implementation (bitmap, hash, rbtree..) based on the userspace requirements. Figuring out the chosen backend requires information about the set flags and the kernel version. Export this to userspace so nft can include this information in '--debug=netlink' output. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 49c944e78463..7d6bc19a0153 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -394,6 +394,8 @@ enum nft_set_field_attributes { * @NFTA_SET_HANDLE: set handle (NLA_U64) * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + * @NFTA_SET_TYPE: set backend type (NLA_STRING) + * @NFTA_SET_COUNT: number of set elements (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -415,6 +417,8 @@ enum nft_set_attributes { NFTA_SET_HANDLE, NFTA_SET_EXPR, NFTA_SET_EXPRESSIONS, + NFTA_SET_TYPE, + NFTA_SET_COUNT, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a133e1c175ce..b28f6730e26d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4569,6 +4569,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { @@ -4763,6 +4765,27 @@ static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) return size; } +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. + */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4843,6 +4866,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) -- cgit v1.2.3 From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:18 -0400 Subject: ipv4: prefer multipath nexthop that matches source address With multipath routes, try to ensure that packets leave on the device that is associated with the source address. Avoid the following tcpdump example: veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S] veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S] Which can happen easily with the most straightforward setup: ip addr add 10.0.0.1/24 dev veth0 ip addr add 10.1.0.1/24 dev veth1 ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \ nexthop via 10.1.0.2 dev veth1 This is apparently considered WAI, based on the comment in ip_route_output_key_hash_rcu: * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK It may be ok for some uses of multipath, but not all. For instance, when using two ISPs, a router may drop packets with unknown source. The behavior occurs because tcp_v4_connect makes three route lookups when establishing a connection: 1. ip_route_connect calls to select a source address, with saddr zero. 2. ip_route_connect calls again now that saddr and daddr are known. 3. ip_route_newports calls again after a source port is also chosen. With a route with multiple nexthops, each lookup may make a different choice depending on available entropy to fib_select_multipath. So it is possible for 1 to select the saddr from the first entry, but 3 to select the second entry. Leading to the above situation. Address this by preferring a match that matches the flowi4 saddr. This will make 2 and 3 make the same choice as 1. Continue to update the backup choice until a choice that matches saddr is found. Do this in fib_select_multipath itself, rather than passing an fl4_oif constraint, to avoid changing non-multipath route selection. Commit e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows how that may cause regressions. Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to refresh in the loop. This does not happen in IPv6, which performs only one lookup. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_semantics.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/route.c | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a..48bb3cf41469 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5326f1501af0..2371f311a1e1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + if (use_neigh && !fib_good_nh(nexthop_nh)) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cffbe83802..e5e4c71be3af 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,7 +2154,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif -- cgit v1.2.3 From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:19 -0400 Subject: ip: load balance tcp connections to single dst addr and port Load balance new TCP connections across nexthops also when they connect to the same service at a single remote address and port. This affects only port-based multipath hashing: fib_multipath_hash_policy 1 or 3. Local connections must choose both a source address and port when connecting to a remote service, in ip_route_connect. This "chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and simplify ip_route_{connect,newports}()")) is resolved by first selecting a source address, by looking up a route using the zero wildcard source port and address. As a result multiple connections to the same destination address and port have no entropy in fib_multipath_hash. This is not a problem when forwarding, as skb-based hashing has a 4-tuple. Nor when establishing UDP connections, as autobind there selects a port before reaching ip_route_connect. Load balance also TCP, by using a random port in fib_multipath_hash. Port assignment in inet_hash_connect is not atomic with ip_route_connect. Thus ports are unpredictable, effectively random. Implementation details: Do not actually pass a random fl4_sport, as that affects not only hashing, but routing more broadly, and can match a source port based policy route, which existing wildcard port 0 will not. Instead, define a new wildcard flowi flag that is used only for hashing. Selecting a random source is equivalent to just selecting a random hash entirely. But for code clarity, follow the normal 4-tuple hash process and only update this field. fib_multipath_hash can be reached with zero sport from other code paths, so explicitly pass this flowi flag, rather than trying to infer this case in the function itself. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/route.c | 13 ++++++++++--- net/ipv6/route.c | 13 ++++++++++--- net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 2a3f0c42f092..a1839c278d87 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -39,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..8e39aa822cf9 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e5e4c71be3af..507b2e5dec50 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; @@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d0351e95d916..aa6b45bd3515 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7dcb33f879ee..e8e68a142649 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); -- cgit v1.2.3 From 144530c15ec7fa95b29812d86f4be527338ea204 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:16 -0700 Subject: pds_core: remove extra name description Fix the kernel-doc complaint include/linux/pds/pds_adminq.h:481: warning: Excess struct member 'name' description in 'pds_core_lif_getattr_comp' Reviewed-by: Simon Horman Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index ddd111f04ca0..339156113fa5 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -463,7 +463,6 @@ struct pds_core_lif_getattr_cmd { * @rsvd: Word boundary padding * @comp_index: Index in the descriptor ring for which this is the completion * @state: LIF state (enum pds_core_lif_state) - * @name: LIF name string, 0 terminated * @features: Features (enum pds_core_hw_features) * @rsvd2: Word boundary padding * @color: Color bit -- cgit v1.2.3 From 7c4f4c4fa9b6fbb7e483bebd02f7b9cbc20ca5cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:17 -0700 Subject: pds_core: smaller adminq poll starting interval Shorten the adminq poll starting interval in order to notice any transaction errors more quickly. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/adminq.c | 4 ++-- include/linux/pds/pds_adminq.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index 506f682d15c1..097bb092bdb8 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -225,7 +225,7 @@ int pdsc_adminq_post(struct pdsc *pdsc, union pds_core_adminq_comp *comp, bool fast_poll) { - unsigned long poll_interval = 1; + unsigned long poll_interval = 200; unsigned long poll_jiffies; unsigned long time_limit; unsigned long time_start; @@ -252,7 +252,7 @@ int pdsc_adminq_post(struct pdsc *pdsc, time_limit = time_start + HZ * pdsc->devcmd_timeout; do { /* Timeslice the actual wait to catch IO errors etc early */ - poll_jiffies = msecs_to_jiffies(poll_interval); + poll_jiffies = usecs_to_jiffies(poll_interval); remaining = wait_for_completion_timeout(wc, poll_jiffies); if (remaining) break; diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 339156113fa5..40ff0ec2b879 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -4,7 +4,7 @@ #ifndef _PDS_CORE_ADMINQ_H_ #define _PDS_CORE_ADMINQ_H_ -#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256 +#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256000 /* usecs */ enum pds_core_adminq_flags { PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ -- cgit v1.2.3 From 468d8b462ac64659caec53eff34f02963d5f52c8 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:45 -0500 Subject: iidc/ice/irdma: Rename IDC header file To prepare for the IDC upgrade to support different CORE PCI drivers, rename header file from iidc.h to iidc_rdma.h since this files functionality is specifically for RDMA support. Use net/dscp.h include in irdma osdep.h and DSCP_MAX type.h, instead of iidc header and define. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- MAINTAINERS | 2 +- drivers/infiniband/hw/irdma/main.h | 2 +- drivers/infiniband/hw/irdma/osdep.h | 2 +- drivers/infiniband/hw/irdma/type.h | 4 +- drivers/net/ethernet/intel/ice/ice_idc_int.h | 2 +- include/linux/net/intel/iidc.h | 109 --------------------------- include/linux/net/intel/iidc_rdma.h | 109 +++++++++++++++++++++++++++ 7 files changed, 115 insertions(+), 115 deletions(-) delete mode 100644 include/linux/net/intel/iidc.h create mode 100644 include/linux/net/intel/iidc_rdma.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..136309587791 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11890,7 +11890,7 @@ F: Documentation/networking/device_drivers/ethernet/intel/ F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ F: include/linux/avf/virtchnl.h -F: include/linux/net/intel/iidc.h +F: include/linux/net/intel/*/ INTEL ETHERNET PROTOCOL DRIVER FOR RDMA M: Mustafa Ismail diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index bb0b6494ccb2..e8083b0c8cb2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -29,7 +29,7 @@ #include #endif #include -#include +#include #include #include #include diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index 4b4f78288d12..3f73ceacccb6 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -5,8 +5,8 @@ #include #include -#include #include +#include #define STATS_TIMER_DELAY 60000 diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 59b34afa867b..527c6da2c1ac 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -567,7 +567,7 @@ struct irdma_sc_vsi { u8 qos_rel_bw; u8 qos_prio_type; u8 stats_idx; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; u64 hw_stats_regs[IRDMA_HW_STAT_INDEX_MAX_GEN_1]; bool dscp_mode:1; @@ -695,7 +695,7 @@ struct irdma_l2params { u16 qs_handle_list[IRDMA_MAX_USER_PRIORITY]; u16 mtu; u8 up2tc[IRDMA_MAX_USER_PRIORITY]; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; u8 num_tc; u8 vsi_rel_bw; u8 vsi_prio_type; diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h index 4b0c86757df9..b0c504a6408e 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc_int.h +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -4,7 +4,7 @@ #ifndef _ICE_IDC_INT_H_ #define _ICE_IDC_INT_H_ -#include +#include struct ice_pf; diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h deleted file mode 100644 index 13274c3def66..000000000000 --- a/include/linux/net/intel/iidc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. */ - -#ifndef _IIDC_H_ -#define _IIDC_H_ - -#include -#include -#include -#include -#include -#include - -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ -}; - -enum iidc_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, -}; - -enum iidc_rdma_protocol { - IIDC_RDMA_PROTOCOL_IWARP = BIT(0), - IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), -}; - -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; -}; - -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); - u32 reg; -}; - -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - -/* Structure representing auxiliary driver tailored information about the core - * PCI dev, each auxiliary driver using the IIDC interface will have an - * instance of this struct dedicated to it. - */ - -struct iidc_auxiliary_dev { - struct auxiliary_device adev; - struct ice_pf *pf; -}; - -/* structure representing the auxiliary driver. This struct is to be - * allocated and populated by the auxiliary driver's owner. The core PCI - * driver will access these ops by performing a container_of on the - * auxiliary_device->dev.driver. - */ -struct iidc_auxiliary_drv { - struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); -}; - -#endif /* _IIDC_H_*/ diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h new file mode 100644 index 000000000000..0cd75404e459 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021, Intel Corporation. */ + +#ifndef _IIDC_RDMA_H_ +#define _IIDC_RDMA_H_ + +#include +#include +#include +#include +#include +#include + +enum iidc_event_type { + IIDC_EVENT_BEFORE_MTU_CHANGE, + IIDC_EVENT_AFTER_MTU_CHANGE, + IIDC_EVENT_BEFORE_TC_CHANGE, + IIDC_EVENT_AFTER_TC_CHANGE, + IIDC_EVENT_CRIT_ERR, + IIDC_EVENT_NBITS /* must be last */ +}; + +enum iidc_reset_type { + IIDC_PFR, + IIDC_CORER, + IIDC_GLOBR, +}; + +enum iidc_rdma_protocol { + IIDC_RDMA_PROTOCOL_IWARP = BIT(0), + IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), +}; + +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_MAX_DSCP_MAPPING 64 +#define IIDC_DSCP_PFC_MODE 0x1 + +/* Struct to hold per RDMA Qset info */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; /* Qset TEID */ + u16 qs_handle; /* RDMA driver provides this */ + u16 vport_id; /* VSI index */ + u8 tc; /* TC branch the Qset should belong to */ +}; + +struct iidc_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_qos_params { + struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; +}; + +struct iidc_event { + DECLARE_BITMAP(type, IIDC_EVENT_NBITS); + u32 reg; +}; + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); + +/* Structure representing auxiliary driver tailored information about the core + * PCI dev, each auxiliary driver using the IIDC interface will have an + * instance of this struct dedicated to it. + */ + +struct iidc_auxiliary_dev { + struct auxiliary_device adev; + struct ice_pf *pf; +}; + +/* structure representing the auxiliary driver. This struct is to be + * allocated and populated by the auxiliary driver's owner. The core PCI + * driver will access these ops by performing a container_of on the + * auxiliary_device->dev.driver. + */ +struct iidc_auxiliary_drv { + struct auxiliary_driver adrv; + /* This event_handler is meant to be a blocking call. For instance, + * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not + * return until the auxiliary driver is ready for the MTU change to + * happen. + */ + void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); +}; + +#endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From 97b5631aae6896369712d6b7131afbc95c753587 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:46 -0500 Subject: iidc/ice/irdma: Rename to iidc_* convention In preparation of supporting more than a single core PCI driver for RDMA, homogenize naming to iidc_rdma_* and IIDC_RDMA_* form. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- drivers/infiniband/hw/irdma/main.c | 41 +++++++++++++++------------- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 8 +++--- drivers/net/ethernet/intel/ice/ice_idc.c | 23 +++++++++------- drivers/net/ethernet/intel/ice/ice_idc_int.h | 2 +- drivers/net/ethernet/intel/ice/ice_main.c | 8 +++--- include/linux/net/intel/iidc_rdma.h | 38 ++++++++++++++------------ 6 files changed, 64 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 1ee8969595d3..46489c0ab511 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -61,7 +61,7 @@ static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) } static void irdma_fill_qos_info(struct irdma_l2params *l2params, - struct iidc_qos_params *qos_info) + struct iidc_rdma_qos_params *qos_info) { int i; @@ -85,12 +85,12 @@ static void irdma_fill_qos_info(struct irdma_l2params *l2params, } } -static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event) +static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_rdma_event *event) { struct irdma_device *iwdev = dev_get_drvdata(&pf->adev->dev); struct irdma_l2params l2params = {}; - if (*event->type & BIT(IIDC_EVENT_AFTER_MTU_CHANGE)) { + if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { ibdev_dbg(&iwdev->ibdev, "CLNT: new MTU = %d\n", iwdev->netdev->mtu); if (iwdev->vsi.mtu != iwdev->netdev->mtu) { l2params.mtu = iwdev->netdev->mtu; @@ -98,13 +98,13 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } - } else if (*event->type & BIT(IIDC_EVENT_BEFORE_TC_CHANGE)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE)) { if (iwdev->vsi.tc_change_pending) return; irdma_prep_tc_change(iwdev); - } else if (*event->type & BIT(IIDC_EVENT_AFTER_TC_CHANGE)) { - struct iidc_qos_params qos_info = {}; + } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { + struct iidc_rdma_qos_params qos_info = {}; if (!iwdev->vsi.tc_change_pending) return; @@ -116,7 +116,7 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = qos_info.num_tc > 1 && !l2params.dscp_mode; irdma_change_l2params(&iwdev->vsi, &l2params); - } else if (*event->type & BIT(IIDC_EVENT_CRIT_ERR)) { + } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", event->reg); if (event->reg & IRDMAPFINT_OICR_PE_CRITERR_M) { @@ -245,11 +245,12 @@ static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) static void irdma_remove(struct auxiliary_device *aux_dev) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct ice_pf *pf; + + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + pf = iidc_adev->pf; irdma_ib_unregister_device(iwdev); ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); @@ -292,17 +293,19 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) { - struct iidc_auxiliary_dev *iidc_adev = container_of(aux_dev, - struct iidc_auxiliary_dev, - adev); - struct ice_pf *pf = iidc_adev->pf; - struct ice_vsi *vsi = ice_get_main_vsi(pf); - struct iidc_qos_params qos_info = {}; + struct iidc_rdma_core_auxiliary_dev *iidc_adev; + struct iidc_rdma_qos_params qos_info = {}; + struct irdma_l2params l2params = {}; struct irdma_device *iwdev; struct irdma_pci_f *rf; - struct irdma_l2params l2params = {}; + struct ice_vsi *vsi; + struct ice_pf *pf; int err; + iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); + pf = iidc_adev->pf; + vsi = ice_get_main_vsi(pf); + if (!vsi) return -EIO; iwdev = ib_alloc_device(irdma_device, ibdev); @@ -367,7 +370,7 @@ static const struct auxiliary_device_id irdma_auxiliary_id_table[] = { MODULE_DEVICE_TABLE(auxiliary, irdma_auxiliary_id_table); -static struct iidc_auxiliary_drv irdma_auxiliary_drv = { +static struct iidc_rdma_core_auxiliary_drv irdma_auxiliary_drv = { .adrv = { .id_table = irdma_auxiliary_id_table, .probe = irdma_probe, diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index a7c510832824..fe16c59796db 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -352,8 +352,8 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) struct ice_aqc_port_ets_elem buf = { 0 }; struct ice_dcbx_cfg *old_cfg, *curr_cfg; struct device *dev = ice_pf_to_dev(pf); + struct iidc_rdma_event *event; int ret = ICE_DCB_NO_HW_CHG; - struct iidc_event *event; struct ice_vsi *pf_vsi; curr_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; @@ -405,7 +405,7 @@ int ice_pf_dcb_cfg(struct ice_pf *pf, struct ice_dcbx_cfg *new_cfg, bool locked) goto free_cfg; } - set_bit(IIDC_EVENT_BEFORE_TC_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); @@ -740,7 +740,7 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; - struct iidc_event *event; + struct iidc_rdma_event *event; u8 tc_map = 0; int v, ret; @@ -789,7 +789,7 @@ void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) if (!event) return; - set_bit(IIDC_EVENT_AFTER_TC_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_AFTER_TC_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); } diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index bab3e81cad5d..d55107077d8c 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -15,7 +15,8 @@ static DEFINE_XARRAY_ALLOC1(ice_aux_id); * This function has to be called with a device_lock on the * pf->adev.dev to avoid race conditions. */ -static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) +static +struct iidc_rdma_core_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) { struct auxiliary_device *adev; @@ -23,8 +24,8 @@ static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) if (!adev || !adev->dev.driver) return NULL; - return container_of(adev->dev.driver, struct iidc_auxiliary_drv, - adrv.driver); + return container_of(adev->dev.driver, + struct iidc_rdma_core_auxiliary_drv, adrv.driver); } /** @@ -32,9 +33,9 @@ static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) * @pf: pointer to PF struct * @event: event struct */ -void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event) +void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_rdma_event *event) { - struct iidc_auxiliary_drv *iadrv; + struct iidc_rdma_core_auxiliary_drv *iadrv; if (WARN_ON_ONCE(!in_task())) return; @@ -141,7 +142,8 @@ EXPORT_SYMBOL_GPL(ice_del_rdma_qset); * @pf: struct for PF * @reset_type: type of reset */ -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type) +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type) { enum ice_reset_req reset; @@ -205,7 +207,7 @@ EXPORT_SYMBOL_GPL(ice_rdma_update_vsi_filter); * @pf: pointer to PF struct * @qos: set of QoS values */ -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos) +void ice_get_qos_params(struct ice_pf *pf, struct iidc_rdma_qos_params *qos) { struct ice_dcbx_cfg *dcbx_cfg; unsigned int i; @@ -263,9 +265,10 @@ EXPORT_SYMBOL_GPL(ice_free_rdma_qvector); */ static void ice_adev_release(struct device *dev) { - struct iidc_auxiliary_dev *iadev; + struct iidc_rdma_core_auxiliary_dev *iadev; - iadev = container_of(dev, struct iidc_auxiliary_dev, adev.dev); + iadev = container_of(dev, struct iidc_rdma_core_auxiliary_dev, + adev.dev); kfree(iadev); } @@ -275,7 +278,7 @@ static void ice_adev_release(struct device *dev) */ int ice_plug_aux_dev(struct ice_pf *pf) { - struct iidc_auxiliary_dev *iadev; + struct iidc_rdma_core_auxiliary_dev *iadev; struct auxiliary_device *adev; int ret; diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h index b0c504a6408e..03cd7d8d1aaa 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc_int.h +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -8,6 +8,6 @@ struct ice_pf; -void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event); +void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_rdma_event *event); #endif /* !_ICE_IDC_INT_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d390157b59fe..6a20f9f2e608 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2401,11 +2401,11 @@ static void ice_service_task(struct work_struct *work) } if (test_and_clear_bit(ICE_AUX_ERR_PENDING, pf->state)) { - struct iidc_event *event; + struct iidc_rdma_event *event; event = kzalloc(sizeof(*event), GFP_KERNEL); if (event) { - set_bit(IIDC_EVENT_CRIT_ERR, event->type); + set_bit(IIDC_RDMA_EVENT_CRIT_ERR, event->type); /* report the entire OICR value to AUX driver */ swap(event->reg, pf->oicr_err_reg); ice_send_event_to_aux(pf, event); @@ -2424,11 +2424,11 @@ static void ice_service_task(struct work_struct *work) ice_plug_aux_dev(pf); if (test_and_clear_bit(ICE_FLAG_MTU_CHANGED, pf->flags)) { - struct iidc_event *event; + struct iidc_rdma_event *event; event = kzalloc(sizeof(*event), GFP_KERNEL); if (event) { - set_bit(IIDC_EVENT_AFTER_MTU_CHANGE, event->type); + set_bit(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, event->type); ice_send_event_to_aux(pf, event); kfree(event); } diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 0cd75404e459..2b24a9912fa0 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -11,16 +11,16 @@ #include #include -enum iidc_event_type { - IIDC_EVENT_BEFORE_MTU_CHANGE, - IIDC_EVENT_AFTER_MTU_CHANGE, - IIDC_EVENT_BEFORE_TC_CHANGE, - IIDC_EVENT_AFTER_TC_CHANGE, - IIDC_EVENT_CRIT_ERR, - IIDC_EVENT_NBITS /* must be last */ +enum iidc_rdma_event_type { + IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, + IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, + IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, + IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_CRIT_ERR, + IIDC_RDMA_EVENT_NBITS /* must be last */ }; -enum iidc_reset_type { +enum iidc_rdma_reset_type { IIDC_PFR, IIDC_CORER, IIDC_GLOBR, @@ -47,7 +47,7 @@ struct iidc_rdma_qset_params { u8 tc; /* TC branch the Qset should belong to */ }; -struct iidc_qos_info { +struct iidc_rdma_qos_info { u64 tc_ctx; u8 rel_bw; u8 prio_type; @@ -56,8 +56,8 @@ struct iidc_qos_info { }; /* Struct to pass QoS info */ -struct iidc_qos_params { - struct iidc_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; u8 up2tc[IIDC_MAX_USER_PRIORITY]; u8 vport_relative_bw; u8 vport_priority_type; @@ -66,8 +66,8 @@ struct iidc_qos_params { u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; }; -struct iidc_event { - DECLARE_BITMAP(type, IIDC_EVENT_NBITS); +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); u32 reg; }; @@ -75,9 +75,11 @@ struct ice_pf; int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); @@ -86,7 +88,7 @@ void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); * instance of this struct dedicated to it. */ -struct iidc_auxiliary_dev { +struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; struct ice_pf *pf; }; @@ -96,14 +98,14 @@ struct iidc_auxiliary_dev { * driver will access these ops by performing a container_of on the * auxiliary_device->dev.driver. */ -struct iidc_auxiliary_drv { +struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; /* This event_handler is meant to be a blocking call. For instance, * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not * return until the auxiliary driver is ready for the MTU change to * happen. */ - void (*event_handler)(struct ice_pf *pf, struct iidc_event *event); + void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ -- cgit v1.2.3 From d9251a560ba67bbedd53b81aee32e1ad95f42000 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:47 -0500 Subject: iidc/ice/irdma: Break iidc.h into two headers In preparation of supporting more than a single core PCI driver for RDMA, break the iidc_rdma.h header file into two more focused headers. Only the elements universal to all Intel drivers will remain in the generic iidc_rdma.h header. Move the ice specific information to an ice specific header file named iidc_rdma_ice.h. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_idc_int.h | 1 + include/linux/net/intel/iidc_rdma.h | 14 +------------- include/linux/net/intel/iidc_rdma_ice.h | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 include/linux/net/intel/iidc_rdma_ice.h (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_idc_int.h b/drivers/net/ethernet/intel/ice/ice_idc_int.h index 03cd7d8d1aaa..17dbfcfb6a2a 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc_int.h +++ b/drivers/net/ethernet/intel/ice/ice_idc_int.h @@ -5,6 +5,7 @@ #define _ICE_IDC_INT_H_ #include +#include struct ice_pf; diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 2b24a9912fa0..1e8136395154 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2021, Intel Corporation. */ +/* Copyright (C) 2021-2025, Intel Corporation. */ #ifndef _IIDC_RDMA_H_ #define _IIDC_RDMA_H_ @@ -71,18 +71,6 @@ struct iidc_rdma_event { u32 reg; }; -struct ice_pf; - -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, - enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); - /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h new file mode 100644 index 000000000000..78d10003d776 --- /dev/null +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2025, Intel Corporation. */ + +#ifndef _IIDC_RDMA_ICE_H_ +#define _IIDC_RDMA_ICE_H_ + +struct ice_pf; + +int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct ice_pf *pf, + enum iidc_rdma_reset_type reset_type); +int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); +void ice_get_qos_params(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos); +int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); + +#endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 8239b771b94b639556c1987185fd82b2a896c923 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Tue, 15 Apr 2025 21:15:48 -0500 Subject: ice: Replace ice specific DSCP mapping num with a kernel define Replace ice driver specific DSCP mapping number defines ICE_DSCP_NUM_VAL and IIDC_MAX_DSCP_MAPPING with an equivalent kernel define DSCP_MAX. Reviewed-by: Przemek Kitszel Signed-off-by: Tatyana Nikolova Signed-off-by: Dave Ertman Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dcb.c | 2 +- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_idc.c | 2 +- drivers/net/ethernet/intel/ice/ice_type.h | 6 +++--- include/linux/net/intel/iidc_rdma.h | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c index 74418c445cc4..64737fc62306 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb.c @@ -1288,7 +1288,7 @@ ice_add_dscp_up_tlv(struct ice_lldp_org_tlv *tlv, struct ice_dcbx_cfg *dcbcfg) tlv->ouisubtype = htonl(ouisubtype); /* bytes 0 - 63 - IPv4 DSCP2UP LUT */ - for (i = 0; i < ICE_DSCP_NUM_VAL; i++) { + for (i = 0; i < DSCP_MAX; i++) { /* IPv4 mapping */ buf[i] = dcbcfg->dscp_map[i]; /* IPv6 mapping */ diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 6d50b90a7359..a10c1c8d8697 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -754,7 +754,7 @@ static int ice_dcbnl_setapp(struct net_device *netdev, struct dcb_app *app) if (!ice_is_feature_supported(pf, ICE_F_DSCP)) return -EOPNOTSUPP; - if (app->protocol >= ICE_DSCP_NUM_VAL) { + if (app->protocol >= DSCP_MAX) { netdev_err(netdev, "DSCP value 0x%04X out of range\n", app->protocol); return -EINVAL; @@ -931,7 +931,7 @@ static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app) /* if the last DSCP mapping just got deleted, need to switch * to L2 VLAN QoS mode */ - if (bitmap_empty(new_cfg->dscp_mapped, ICE_DSCP_NUM_VAL) && + if (bitmap_empty(new_cfg->dscp_mapped, DSCP_MAX) && new_cfg->pfc_mode == ICE_QOS_MODE_DSCP) { ret = ice_aq_set_pfc_mode(&pf->hw, ICE_AQC_PFC_VLAN_BASED_PFC, diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index d55107077d8c..4421ba024396 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -225,7 +225,7 @@ void ice_get_qos_params(struct ice_pf *pf, struct iidc_rdma_qos_params *qos) qos->pfc_mode = dcbx_cfg->pfc_mode; if (qos->pfc_mode == IIDC_DSCP_PFC_MODE) - for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++) + for (i = 0; i < DSCP_MAX; i++) qos->dscp_map[i] = dcbx_cfg->dscp_map[i]; } EXPORT_SYMBOL_GPL(ice_get_qos_params); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 0aab21113cc4..529f978ea45a 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -19,6 +19,7 @@ #include "ice_vlan_mode.h" #include "ice_fwlog.h" #include +#include static inline bool ice_is_tc_ena(unsigned long bitmap, u8 tc) { @@ -695,7 +696,6 @@ struct ice_dcb_app_priority_table { #define ICE_MAX_USER_PRIORITY 8 #define ICE_DCBX_MAX_APPS 64 -#define ICE_DSCP_NUM_VAL 64 #define ICE_LLDPDU_SIZE 1500 #define ICE_TLV_STATUS_OPER 0x1 #define ICE_TLV_STATUS_SYNC 0x2 @@ -718,9 +718,9 @@ struct ice_dcbx_cfg { u8 pfc_mode; struct ice_dcb_app_priority_table app[ICE_DCBX_MAX_APPS]; /* when DSCP mapping defined by user set its bit to 1 */ - DECLARE_BITMAP(dscp_mapped, ICE_DSCP_NUM_VAL); + DECLARE_BITMAP(dscp_mapped, DSCP_MAX); /* array holding DSCP -> UP/TC values for DSCP L3 QoS mode */ - u8 dscp_map[ICE_DSCP_NUM_VAL]; + u8 dscp_map[DSCP_MAX]; u8 dcbx_mode; #define ICE_DCBX_MODE_CEE 0x1 #define ICE_DCBX_MODE_IEEE 0x2 diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 1e8136395154..7f1910289534 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -10,6 +10,7 @@ #include #include #include +#include enum iidc_rdma_event_type { IIDC_RDMA_EVENT_BEFORE_MTU_CHANGE, @@ -32,7 +33,6 @@ enum iidc_rdma_protocol { }; #define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_MAX_DSCP_MAPPING 64 #define IIDC_DSCP_PFC_MODE 0x1 /* Struct to hold per RDMA Qset info */ @@ -63,7 +63,7 @@ struct iidc_rdma_qos_params { u8 vport_priority_type; u8 num_tc; u8 pfc_mode; - u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + u8 dscp_map[DSCP_MAX]; }; struct iidc_rdma_event { -- cgit v1.2.3 From a3e1c0ad835702555d90565584ab6f723adf7f94 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 29 Apr 2025 08:04:46 +0200 Subject: net: phy: factor out provider part from mdio_bus.c After 52358dd63e34 ("net: phy: remove function stubs") there's a problem if CONFIG_MDIO_BUS is set, but CONFIG_PHYLIB is not. mdiobus_scan() uses phylib functions like get_phy_device(). Bringing back the stub wouldn't make much sense, because it would allow to compile mdiobus_scan(), but the function would be unusable. The stub returned NULL, and we have the following in mdiobus_scan(): phydev = get_phy_device(bus, addr, c45); if (IS_ERR(phydev)) return phydev; So calling mdiobus_scan() w/o CONFIG_PHYLIB would cause a crash later in mdiobus_scan(). In general the PHYLIB functionality isn't optional here. Consequently, MDIO bus providers depend on PHYLIB. Therefore factor it out and build it together with the libphy core modules. In addition make all MDIO bus providers under /drivers/net/mdio depend on PHYLIB. Same applies to enetc MDIO bus provider. Note that PHYLIB selects MDIO_DEVRES, therefore we can omit this here. Fixes: 52358dd63e34 ("net: phy: remove function stubs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504270639.mT0lh2o1-lkp@intel.com/ Reviewed-by: Jacob Keller Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c74772a9-dab6-44bf-a657-389df89d85c2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/Kconfig | 3 +- drivers/net/mdio/Kconfig | 16 +- drivers/net/phy/Makefile | 2 +- drivers/net/phy/mdio_bus.c | 462 +------------------------ drivers/net/phy/mdio_bus_provider.c | 484 +++++++++++++++++++++++++++ include/linux/phy.h | 1 + 6 files changed, 494 insertions(+), 474 deletions(-) create mode 100644 drivers/net/phy/mdio_bus_provider.c (limited to 'include') diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index 5367e8af1e1a..7250d3bbf6bb 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -73,8 +73,7 @@ config FSL_ENETC_IERB config FSL_ENETC_MDIO tristate "ENETC MDIO driver" - depends on PCI && MDIO_BUS - select MDIO_DEVRES + depends on PCI && PHYLIB help This driver supports NXP ENETC Central MDIO controller as a PCIe physical function (PF) device. diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index f680ed676797..107236cd6bd9 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -19,30 +19,25 @@ config MDIO_BUS reflects whether the mdio_bus/mdio_device code is built as a loadable module or built-in. +if PHYLIB + config FWNODE_MDIO - def_tristate PHYLIB - depends on (ACPI || OF) || COMPILE_TEST + def_tristate (ACPI || OF) || COMPILE_TEST select FIXED_PHY help FWNODE MDIO bus (Ethernet PHY) accessors config OF_MDIO - def_tristate PHYLIB - depends on OF - depends on PHYLIB + def_tristate OF select FIXED_PHY help OpenFirmware MDIO bus (Ethernet PHY) accessors config ACPI_MDIO - def_tristate PHYLIB - depends on ACPI - depends on PHYLIB + def_tristate ACPI help ACPI MDIO bus (Ethernet PHY) accessors -if MDIO_BUS - config MDIO_DEVRES tristate @@ -57,7 +52,6 @@ config MDIO_SUN4I config MDIO_XGENE tristate "APM X-Gene SoC MDIO bus controller" depends on ARCH_XGENE || COMPILE_TEST - depends on PHYLIB help This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 23ce205ae91d..631859d44451 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -3,7 +3,7 @@ libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ linkmode.o phy_link_topology.o \ - phy_package.o phy_caps.o + phy_package.o phy_caps.o mdio_bus_provider.o mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_MDIO_DEVICE diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index ede596c1a69d..f5ccbe33a384 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -8,17 +8,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include #include #include #include -#include #include #include -#include #include #include #include @@ -27,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -37,8 +33,6 @@ #define CREATE_TRACE_POINTS #include -#include "mdio-boardinfo.h" - static int mdiobus_register_gpiod(struct mdio_device *mdiodev) { /* Deassert the optional reset signal */ @@ -136,45 +130,6 @@ bool mdiobus_is_registered_device(struct mii_bus *bus, int addr) } EXPORT_SYMBOL(mdiobus_is_registered_device); -/** - * mdiobus_alloc_size - allocate a mii_bus structure - * @size: extra amount of memory to allocate for private storage. - * If non-zero, then bus->priv is points to that memory. - * - * Description: called by a bus driver to allocate an mii_bus - * structure to fill in. - */ -struct mii_bus *mdiobus_alloc_size(size_t size) -{ - struct mii_bus *bus; - size_t aligned_size = ALIGN(sizeof(*bus), NETDEV_ALIGN); - size_t alloc_size; - int i; - - /* If we alloc extra space, it should be aligned */ - if (size) - alloc_size = aligned_size + size; - else - alloc_size = sizeof(*bus); - - bus = kzalloc(alloc_size, GFP_KERNEL); - if (!bus) - return NULL; - - bus->state = MDIOBUS_ALLOCATED; - if (size) - bus->priv = (void *)bus + aligned_size; - - /* Initialise the interrupts to polling and 64-bit seqcounts */ - for (i = 0; i < PHY_MAX_ADDR; i++) { - bus->irq[i] = PHY_POLL; - u64_stats_init(&bus->stats[i].syncp); - } - - return bus; -} -EXPORT_SYMBOL(mdiobus_alloc_size); - /** * mdiobus_release - mii_bus device release callback * @d: the target struct device that contains the mii_bus @@ -403,11 +358,12 @@ static const struct attribute_group *mdio_bus_groups[] = { NULL, }; -static struct class mdio_bus_class = { +const struct class mdio_bus_class = { .name = "mdio_bus", .dev_release = mdiobus_release, .dev_groups = mdio_bus_groups, }; +EXPORT_SYMBOL_GPL(mdio_bus_class); /** * mdio_find_bus - Given the name of a mdiobus, find the mii_bus. @@ -451,422 +407,8 @@ struct mii_bus *of_mdio_find_bus(struct device_node *mdio_bus_np) return d ? to_mii_bus(d) : NULL; } EXPORT_SYMBOL(of_mdio_find_bus); - -/* Walk the list of subnodes of a mdio bus and look for a node that - * matches the mdio device's address with its 'reg' property. If - * found, set the of_node pointer for the mdio device. This allows - * auto-probed phy devices to be supplied with information passed in - * via DT. - * If a PHY package is found, PHY is searched also there. - */ -static int of_mdiobus_find_phy(struct device *dev, struct mdio_device *mdiodev, - struct device_node *np) -{ - struct device_node *child; - - for_each_available_child_of_node(np, child) { - int addr; - - if (of_node_name_eq(child, "ethernet-phy-package")) { - /* Validate PHY package reg presence */ - if (!of_property_present(child, "reg")) { - of_node_put(child); - return -EINVAL; - } - - if (!of_mdiobus_find_phy(dev, mdiodev, child)) { - /* The refcount for the PHY package will be - * incremented later when PHY join the Package. - */ - of_node_put(child); - return 0; - } - - continue; - } - - addr = of_mdio_parse_addr(dev, child); - if (addr < 0) - continue; - - if (addr == mdiodev->addr) { - device_set_node(dev, of_fwnode_handle(child)); - /* The refcount on "child" is passed to the mdio - * device. Do _not_ use of_node_put(child) here. - */ - return 0; - } - } - - return -ENODEV; -} - -static void of_mdiobus_link_mdiodev(struct mii_bus *bus, - struct mdio_device *mdiodev) -{ - struct device *dev = &mdiodev->dev; - - if (dev->of_node || !bus->dev.of_node) - return; - - of_mdiobus_find_phy(dev, mdiodev, bus->dev.of_node); -} -#else /* !IS_ENABLED(CONFIG_OF_MDIO) */ -static inline void of_mdiobus_link_mdiodev(struct mii_bus *mdio, - struct mdio_device *mdiodev) -{ -} #endif -/** - * mdiobus_create_device - create a full MDIO device given - * a mdio_board_info structure - * @bus: MDIO bus to create the devices on - * @bi: mdio_board_info structure describing the devices - * - * Returns 0 on success or < 0 on error. - */ -static int mdiobus_create_device(struct mii_bus *bus, - struct mdio_board_info *bi) -{ - struct mdio_device *mdiodev; - int ret = 0; - - mdiodev = mdio_device_create(bus, bi->mdio_addr); - if (IS_ERR(mdiodev)) - return -ENODEV; - - strscpy(mdiodev->modalias, bi->modalias, - sizeof(mdiodev->modalias)); - mdiodev->bus_match = mdio_device_bus_match; - mdiodev->dev.platform_data = (void *)bi->platform_data; - - ret = mdio_device_register(mdiodev); - if (ret) - mdio_device_free(mdiodev); - - return ret; -} - -static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) -{ - struct phy_device *phydev = ERR_PTR(-ENODEV); - struct fwnode_handle *fwnode; - char node_name[16]; - int err; - - phydev = get_phy_device(bus, addr, c45); - if (IS_ERR(phydev)) - return phydev; - - /* For DT, see if the auto-probed phy has a corresponding child - * in the bus node, and set the of_node pointer in this case. - */ - of_mdiobus_link_mdiodev(bus, &phydev->mdio); - - /* Search for a swnode for the phy in the swnode hierarchy of the bus. - * If there is no swnode for the phy provided, just ignore it. - */ - if (dev_fwnode(&bus->dev) && !dev_fwnode(&phydev->mdio.dev)) { - snprintf(node_name, sizeof(node_name), "ethernet-phy@%d", - addr); - fwnode = fwnode_get_named_child_node(dev_fwnode(&bus->dev), - node_name); - if (fwnode) - device_set_node(&phydev->mdio.dev, fwnode); - } - - err = phy_device_register(phydev); - if (err) { - phy_device_free(phydev); - return ERR_PTR(-ENODEV); - } - - return phydev; -} - -/** - * mdiobus_scan_c22 - scan one address on a bus for C22 MDIO devices. - * @bus: mii_bus to scan - * @addr: address on bus to scan - * - * This function scans one address on the MDIO bus, looking for - * devices which can be identified using a vendor/product ID in - * registers 2 and 3. Not all MDIO devices have such registers, but - * PHY devices typically do. Hence this function assumes anything - * found is a PHY, or can be treated as a PHY. Other MDIO devices, - * such as switches, will probably not be found during the scan. - */ -struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr) -{ - return mdiobus_scan(bus, addr, false); -} -EXPORT_SYMBOL(mdiobus_scan_c22); - -/** - * mdiobus_scan_c45 - scan one address on a bus for C45 MDIO devices. - * @bus: mii_bus to scan - * @addr: address on bus to scan - * - * This function scans one address on the MDIO bus, looking for - * devices which can be identified using a vendor/product ID in - * registers 2 and 3. Not all MDIO devices have such registers, but - * PHY devices typically do. Hence this function assumes anything - * found is a PHY, or can be treated as a PHY. Other MDIO devices, - * such as switches, will probably not be found during the scan. - */ -static struct phy_device *mdiobus_scan_c45(struct mii_bus *bus, int addr) -{ - return mdiobus_scan(bus, addr, true); -} - -static int mdiobus_scan_bus_c22(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - if ((bus->phy_mask & BIT(i)) == 0) { - struct phy_device *phydev; - - phydev = mdiobus_scan_c22(bus, i); - if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) - return PTR_ERR(phydev); - } - } - return 0; -} - -static int mdiobus_scan_bus_c45(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - if ((bus->phy_mask & BIT(i)) == 0) { - struct phy_device *phydev; - - /* Don't scan C45 if we already have a C22 device */ - if (bus->mdio_map[i]) - continue; - - phydev = mdiobus_scan_c45(bus, i); - if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) - return PTR_ERR(phydev); - } - } - return 0; -} - -/* There are some C22 PHYs which do bad things when where is a C45 - * transaction on the bus, like accepting a read themselves, and - * stomping over the true devices reply, to performing a write to - * themselves which was intended for another device. Now that C22 - * devices have been found, see if any of them are bad for C45, and if we - * should skip the C45 scan. - */ -static bool mdiobus_prevent_c45_scan(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - struct phy_device *phydev; - u32 oui; - - phydev = mdiobus_get_phy(bus, i); - if (!phydev) - continue; - oui = phydev->phy_id >> 10; - - if (oui == MICREL_OUI) - return true; - } - return false; -} - -/** - * __mdiobus_register - bring up all the PHYs on a given bus and attach them to bus - * @bus: target mii_bus - * @owner: module containing bus accessor functions - * - * Description: Called by a bus driver to bring up all the PHYs - * on a given bus, and attach them to the bus. Drivers should use - * mdiobus_register() rather than __mdiobus_register() unless they - * need to pass a specific owner module. MDIO devices which are not - * PHYs will not be brought up by this function. They are expected - * to be explicitly listed in DT and instantiated by of_mdiobus_register(). - * - * Returns 0 on success or < 0 on error. - */ -int __mdiobus_register(struct mii_bus *bus, struct module *owner) -{ - struct mdio_device *mdiodev; - struct gpio_desc *gpiod; - bool prevent_c45_scan; - int i, err; - - if (!bus || !bus->name) - return -EINVAL; - - /* An access method always needs both read and write operations */ - if (!!bus->read != !!bus->write || !!bus->read_c45 != !!bus->write_c45) - return -EINVAL; - - /* At least one method is mandatory */ - if (!bus->read && !bus->read_c45) - return -EINVAL; - - if (bus->parent && bus->parent->of_node) - bus->parent->of_node->fwnode.flags |= - FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD; - - WARN(bus->state != MDIOBUS_ALLOCATED && - bus->state != MDIOBUS_UNREGISTERED, - "%s: not in ALLOCATED or UNREGISTERED state\n", bus->id); - - bus->owner = owner; - bus->dev.parent = bus->parent; - bus->dev.class = &mdio_bus_class; - bus->dev.groups = NULL; - dev_set_name(&bus->dev, "%s", bus->id); - - /* If the bus state is allocated, we're registering a fresh bus - * that may have a fwnode associated with it. Grab a reference - * to the fwnode. This will be dropped when the bus is released. - * If the bus was set to unregistered, it means that the bus was - * previously registered, and we've already grabbed a reference. - */ - if (bus->state == MDIOBUS_ALLOCATED) - fwnode_handle_get(dev_fwnode(&bus->dev)); - - /* We need to set state to MDIOBUS_UNREGISTERED to correctly release - * the device in mdiobus_free() - * - * State will be updated later in this function in case of success - */ - bus->state = MDIOBUS_UNREGISTERED; - - err = device_register(&bus->dev); - if (err) { - pr_err("mii_bus %s failed to register\n", bus->id); - return -EINVAL; - } - - mutex_init(&bus->mdio_lock); - mutex_init(&bus->shared_lock); - - /* assert bus level PHY GPIO reset */ - gpiod = devm_gpiod_get_optional(&bus->dev, "reset", GPIOD_OUT_HIGH); - if (IS_ERR(gpiod)) { - err = dev_err_probe(&bus->dev, PTR_ERR(gpiod), - "mii_bus %s couldn't get reset GPIO\n", - bus->id); - device_del(&bus->dev); - return err; - } else if (gpiod) { - bus->reset_gpiod = gpiod; - fsleep(bus->reset_delay_us); - gpiod_set_value_cansleep(gpiod, 0); - if (bus->reset_post_delay_us > 0) - fsleep(bus->reset_post_delay_us); - } - - if (bus->reset) { - err = bus->reset(bus); - if (err) - goto error_reset_gpiod; - } - - if (bus->read) { - err = mdiobus_scan_bus_c22(bus); - if (err) - goto error; - } - - prevent_c45_scan = mdiobus_prevent_c45_scan(bus); - - if (!prevent_c45_scan && bus->read_c45) { - err = mdiobus_scan_bus_c45(bus); - if (err) - goto error; - } - - mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); - - bus->state = MDIOBUS_REGISTERED; - dev_dbg(&bus->dev, "probed\n"); - return 0; - -error: - for (i = 0; i < PHY_MAX_ADDR; i++) { - mdiodev = bus->mdio_map[i]; - if (!mdiodev) - continue; - - mdiodev->device_remove(mdiodev); - mdiodev->device_free(mdiodev); - } -error_reset_gpiod: - /* Put PHYs in RESET to save power */ - if (bus->reset_gpiod) - gpiod_set_value_cansleep(bus->reset_gpiod, 1); - - device_del(&bus->dev); - return err; -} -EXPORT_SYMBOL(__mdiobus_register); - -void mdiobus_unregister(struct mii_bus *bus) -{ - struct mdio_device *mdiodev; - int i; - - if (WARN_ON_ONCE(bus->state != MDIOBUS_REGISTERED)) - return; - bus->state = MDIOBUS_UNREGISTERED; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - mdiodev = bus->mdio_map[i]; - if (!mdiodev) - continue; - - if (mdiodev->reset_gpio) - gpiod_put(mdiodev->reset_gpio); - - mdiodev->device_remove(mdiodev); - mdiodev->device_free(mdiodev); - } - - /* Put PHYs in RESET to save power */ - if (bus->reset_gpiod) - gpiod_set_value_cansleep(bus->reset_gpiod, 1); - - device_del(&bus->dev); -} -EXPORT_SYMBOL(mdiobus_unregister); - -/** - * mdiobus_free - free a struct mii_bus - * @bus: mii_bus to free - * - * This function releases the reference to the underlying device - * object in the mii_bus. If this is the last reference, the mii_bus - * will be freed. - */ -void mdiobus_free(struct mii_bus *bus) -{ - /* For compatibility with error handling in drivers. */ - if (bus->state == MDIOBUS_ALLOCATED) { - kfree(bus); - return; - } - - WARN(bus->state != MDIOBUS_UNREGISTERED, - "%s: not in UNREGISTERED state\n", bus->id); - bus->state = MDIOBUS_RELEASED; - - put_device(&bus->dev); -} -EXPORT_SYMBOL(mdiobus_free); - static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) { preempt_disable(); diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c new file mode 100644 index 000000000000..65850e36284d --- /dev/null +++ b/drivers/net/phy/mdio_bus_provider.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* MDIO Bus provider interface + * + * Author: Andy Fleming + * + * Copyright (c) 2004 Freescale Semiconductor, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdio-boardinfo.h" + +/** + * mdiobus_alloc_size - allocate a mii_bus structure + * @size: extra amount of memory to allocate for private storage. + * If non-zero, then bus->priv is points to that memory. + * + * Description: called by a bus driver to allocate an mii_bus + * structure to fill in. + */ +struct mii_bus *mdiobus_alloc_size(size_t size) +{ + struct mii_bus *bus; + size_t aligned_size = ALIGN(sizeof(*bus), NETDEV_ALIGN); + size_t alloc_size; + int i; + + /* If we alloc extra space, it should be aligned */ + if (size) + alloc_size = aligned_size + size; + else + alloc_size = sizeof(*bus); + + bus = kzalloc(alloc_size, GFP_KERNEL); + if (!bus) + return NULL; + + bus->state = MDIOBUS_ALLOCATED; + if (size) + bus->priv = (void *)bus + aligned_size; + + /* Initialise the interrupts to polling and 64-bit seqcounts */ + for (i = 0; i < PHY_MAX_ADDR; i++) { + bus->irq[i] = PHY_POLL; + u64_stats_init(&bus->stats[i].syncp); + } + + return bus; +} +EXPORT_SYMBOL(mdiobus_alloc_size); + +#if IS_ENABLED(CONFIG_OF_MDIO) +/* Walk the list of subnodes of a mdio bus and look for a node that + * matches the mdio device's address with its 'reg' property. If + * found, set the of_node pointer for the mdio device. This allows + * auto-probed phy devices to be supplied with information passed in + * via DT. + * If a PHY package is found, PHY is searched also there. + */ +static int of_mdiobus_find_phy(struct device *dev, struct mdio_device *mdiodev, + struct device_node *np) +{ + struct device_node *child; + + for_each_available_child_of_node(np, child) { + int addr; + + if (of_node_name_eq(child, "ethernet-phy-package")) { + /* Validate PHY package reg presence */ + if (!of_property_present(child, "reg")) { + of_node_put(child); + return -EINVAL; + } + + if (!of_mdiobus_find_phy(dev, mdiodev, child)) { + /* The refcount for the PHY package will be + * incremented later when PHY join the Package. + */ + of_node_put(child); + return 0; + } + + continue; + } + + addr = of_mdio_parse_addr(dev, child); + if (addr < 0) + continue; + + if (addr == mdiodev->addr) { + device_set_node(dev, of_fwnode_handle(child)); + /* The refcount on "child" is passed to the mdio + * device. Do _not_ use of_node_put(child) here. + */ + return 0; + } + } + + return -ENODEV; +} + +static void of_mdiobus_link_mdiodev(struct mii_bus *bus, + struct mdio_device *mdiodev) +{ + struct device *dev = &mdiodev->dev; + + if (dev->of_node || !bus->dev.of_node) + return; + + of_mdiobus_find_phy(dev, mdiodev, bus->dev.of_node); +} +#endif + +/** + * mdiobus_create_device - create a full MDIO device given + * a mdio_board_info structure + * @bus: MDIO bus to create the devices on + * @bi: mdio_board_info structure describing the devices + * + * Returns 0 on success or < 0 on error. + */ +static int mdiobus_create_device(struct mii_bus *bus, + struct mdio_board_info *bi) +{ + struct mdio_device *mdiodev; + int ret = 0; + + mdiodev = mdio_device_create(bus, bi->mdio_addr); + if (IS_ERR(mdiodev)) + return -ENODEV; + + strscpy(mdiodev->modalias, bi->modalias, + sizeof(mdiodev->modalias)); + mdiodev->bus_match = mdio_device_bus_match; + mdiodev->dev.platform_data = (void *)bi->platform_data; + + ret = mdio_device_register(mdiodev); + if (ret) + mdio_device_free(mdiodev); + + return ret; +} + +static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) +{ + struct phy_device *phydev = ERR_PTR(-ENODEV); + struct fwnode_handle *fwnode; + char node_name[16]; + int err; + + phydev = get_phy_device(bus, addr, c45); + if (IS_ERR(phydev)) + return phydev; + +#if IS_ENABLED(CONFIG_OF_MDIO) + /* For DT, see if the auto-probed phy has a corresponding child + * in the bus node, and set the of_node pointer in this case. + */ + of_mdiobus_link_mdiodev(bus, &phydev->mdio); +#endif + + /* Search for a swnode for the phy in the swnode hierarchy of the bus. + * If there is no swnode for the phy provided, just ignore it. + */ + if (dev_fwnode(&bus->dev) && !dev_fwnode(&phydev->mdio.dev)) { + snprintf(node_name, sizeof(node_name), "ethernet-phy@%d", + addr); + fwnode = fwnode_get_named_child_node(dev_fwnode(&bus->dev), + node_name); + if (fwnode) + device_set_node(&phydev->mdio.dev, fwnode); + } + + err = phy_device_register(phydev); + if (err) { + phy_device_free(phydev); + return ERR_PTR(-ENODEV); + } + + return phydev; +} + +/** + * mdiobus_scan_c22 - scan one address on a bus for C22 MDIO devices. + * @bus: mii_bus to scan + * @addr: address on bus to scan + * + * This function scans one address on the MDIO bus, looking for + * devices which can be identified using a vendor/product ID in + * registers 2 and 3. Not all MDIO devices have such registers, but + * PHY devices typically do. Hence this function assumes anything + * found is a PHY, or can be treated as a PHY. Other MDIO devices, + * such as switches, will probably not be found during the scan. + */ +struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr) +{ + return mdiobus_scan(bus, addr, false); +} +EXPORT_SYMBOL(mdiobus_scan_c22); + +/** + * mdiobus_scan_c45 - scan one address on a bus for C45 MDIO devices. + * @bus: mii_bus to scan + * @addr: address on bus to scan + * + * This function scans one address on the MDIO bus, looking for + * devices which can be identified using a vendor/product ID in + * registers 2 and 3. Not all MDIO devices have such registers, but + * PHY devices typically do. Hence this function assumes anything + * found is a PHY, or can be treated as a PHY. Other MDIO devices, + * such as switches, will probably not be found during the scan. + */ +static struct phy_device *mdiobus_scan_c45(struct mii_bus *bus, int addr) +{ + return mdiobus_scan(bus, addr, true); +} + +static int mdiobus_scan_bus_c22(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + if ((bus->phy_mask & BIT(i)) == 0) { + struct phy_device *phydev; + + phydev = mdiobus_scan_c22(bus, i); + if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) + return PTR_ERR(phydev); + } + } + return 0; +} + +static int mdiobus_scan_bus_c45(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + if ((bus->phy_mask & BIT(i)) == 0) { + struct phy_device *phydev; + + /* Don't scan C45 if we already have a C22 device */ + if (bus->mdio_map[i]) + continue; + + phydev = mdiobus_scan_c45(bus, i); + if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) + return PTR_ERR(phydev); + } + } + return 0; +} + +/* There are some C22 PHYs which do bad things when where is a C45 + * transaction on the bus, like accepting a read themselves, and + * stomping over the true devices reply, to performing a write to + * themselves which was intended for another device. Now that C22 + * devices have been found, see if any of them are bad for C45, and if we + * should skip the C45 scan. + */ +static bool mdiobus_prevent_c45_scan(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + struct phy_device *phydev; + u32 oui; + + phydev = mdiobus_get_phy(bus, i); + if (!phydev) + continue; + oui = phydev->phy_id >> 10; + + if (oui == MICREL_OUI) + return true; + } + return false; +} + +/** + * __mdiobus_register - bring up all the PHYs on a given bus and attach them to bus + * @bus: target mii_bus + * @owner: module containing bus accessor functions + * + * Description: Called by a bus driver to bring up all the PHYs + * on a given bus, and attach them to the bus. Drivers should use + * mdiobus_register() rather than __mdiobus_register() unless they + * need to pass a specific owner module. MDIO devices which are not + * PHYs will not be brought up by this function. They are expected + * to be explicitly listed in DT and instantiated by of_mdiobus_register(). + * + * Returns 0 on success or < 0 on error. + */ +int __mdiobus_register(struct mii_bus *bus, struct module *owner) +{ + struct mdio_device *mdiodev; + struct gpio_desc *gpiod; + bool prevent_c45_scan; + int i, err; + + if (!bus || !bus->name) + return -EINVAL; + + /* An access method always needs both read and write operations */ + if (!!bus->read != !!bus->write || !!bus->read_c45 != !!bus->write_c45) + return -EINVAL; + + /* At least one method is mandatory */ + if (!bus->read && !bus->read_c45) + return -EINVAL; + + if (bus->parent && bus->parent->of_node) + bus->parent->of_node->fwnode.flags |= + FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD; + + WARN(bus->state != MDIOBUS_ALLOCATED && + bus->state != MDIOBUS_UNREGISTERED, + "%s: not in ALLOCATED or UNREGISTERED state\n", bus->id); + + bus->owner = owner; + bus->dev.parent = bus->parent; + bus->dev.class = &mdio_bus_class; + bus->dev.groups = NULL; + dev_set_name(&bus->dev, "%s", bus->id); + + /* If the bus state is allocated, we're registering a fresh bus + * that may have a fwnode associated with it. Grab a reference + * to the fwnode. This will be dropped when the bus is released. + * If the bus was set to unregistered, it means that the bus was + * previously registered, and we've already grabbed a reference. + */ + if (bus->state == MDIOBUS_ALLOCATED) + fwnode_handle_get(dev_fwnode(&bus->dev)); + + /* We need to set state to MDIOBUS_UNREGISTERED to correctly release + * the device in mdiobus_free() + * + * State will be updated later in this function in case of success + */ + bus->state = MDIOBUS_UNREGISTERED; + + err = device_register(&bus->dev); + if (err) { + pr_err("mii_bus %s failed to register\n", bus->id); + return -EINVAL; + } + + mutex_init(&bus->mdio_lock); + mutex_init(&bus->shared_lock); + + /* assert bus level PHY GPIO reset */ + gpiod = devm_gpiod_get_optional(&bus->dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(gpiod)) { + err = dev_err_probe(&bus->dev, PTR_ERR(gpiod), + "mii_bus %s couldn't get reset GPIO\n", + bus->id); + device_del(&bus->dev); + return err; + } else if (gpiod) { + bus->reset_gpiod = gpiod; + fsleep(bus->reset_delay_us); + gpiod_set_value_cansleep(gpiod, 0); + if (bus->reset_post_delay_us > 0) + fsleep(bus->reset_post_delay_us); + } + + if (bus->reset) { + err = bus->reset(bus); + if (err) + goto error_reset_gpiod; + } + + if (bus->read) { + err = mdiobus_scan_bus_c22(bus); + if (err) + goto error; + } + + prevent_c45_scan = mdiobus_prevent_c45_scan(bus); + + if (!prevent_c45_scan && bus->read_c45) { + err = mdiobus_scan_bus_c45(bus); + if (err) + goto error; + } + + mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); + + bus->state = MDIOBUS_REGISTERED; + dev_dbg(&bus->dev, "probed\n"); + return 0; + +error: + for (i = 0; i < PHY_MAX_ADDR; i++) { + mdiodev = bus->mdio_map[i]; + if (!mdiodev) + continue; + + mdiodev->device_remove(mdiodev); + mdiodev->device_free(mdiodev); + } +error_reset_gpiod: + /* Put PHYs in RESET to save power */ + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); + + device_del(&bus->dev); + return err; +} +EXPORT_SYMBOL(__mdiobus_register); + +void mdiobus_unregister(struct mii_bus *bus) +{ + struct mdio_device *mdiodev; + int i; + + if (WARN_ON_ONCE(bus->state != MDIOBUS_REGISTERED)) + return; + bus->state = MDIOBUS_UNREGISTERED; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + mdiodev = bus->mdio_map[i]; + if (!mdiodev) + continue; + + if (mdiodev->reset_gpio) + gpiod_put(mdiodev->reset_gpio); + + mdiodev->device_remove(mdiodev); + mdiodev->device_free(mdiodev); + } + + /* Put PHYs in RESET to save power */ + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); + + device_del(&bus->dev); +} +EXPORT_SYMBOL(mdiobus_unregister); + +/** + * mdiobus_free - free a struct mii_bus + * @bus: mii_bus to free + * + * This function releases the reference to the underlying device + * object in the mii_bus. If this is the last reference, the mii_bus + * will be freed. + */ +void mdiobus_free(struct mii_bus *bus) +{ + /* For compatibility with error handling in drivers. */ + if (bus->state == MDIOBUS_ALLOCATED) { + kfree(bus); + return; + } + + WARN(bus->state != MDIOBUS_UNREGISTERED, + "%s: not in UNREGISTERED state\n", bus->id); + bus->state = MDIOBUS_RELEASED; + + put_device(&bus->dev); +} +EXPORT_SYMBOL(mdiobus_free); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3beaf225ee88..d62d292024bc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2062,6 +2062,7 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct netlink_ext_ack *extack); extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; struct mdio_board_info { const char *bus_id; -- cgit v1.2.3 From 66d454e99d71857faf249486912e381ec83760b4 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Fri, 2 May 2025 09:15:21 -0700 Subject: bpf: udp: Make sure iter->batch always contains a full bucket snapshot Require that iter->batch always contains a full bucket snapshot. This invariant is important to avoid skipping or repeating sockets during iteration when combined with the next few patches. Before, there were two cases where a call to bpf_iter_udp_batch may only capture part of a bucket: 1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1]. 2. When more sockets are added to the bucket while calling bpf_iter_udp_realloc_batch(), making the updated batch size insufficient [2]. In cases where the batch size only covers part of a bucket, it is possible to forget which sockets were already visited, especially if we have to process a bucket in more than two batches. This forces us to choose between repeating or skipping sockets, so don't allow this: 1. Stop iteration and propagate -ENOMEM up to userspace if reallocation fails instead of continuing with a partial batch. 2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but if we still aren't able to capture the full bucket, call bpf_iter_udp_realloc_batch() again while holding the bucket lock to guarantee the bucket does not change. On the second attempt use GFP_NOWAIT since we hold onto the spin lock. Introduce the udp_portaddr_for_each_entry_from macro and use it instead of udp_portaddr_for_each_entry to make it possible to continue iteration from an arbitrary socket. This is required for this patch in the GFP_NOWAIT case to allow us to fill the rest of a batch starting from the middle of a bucket and the later patch which skips sockets that were already seen. Testing all scenarios directly is a bit difficult, but I did some manual testing to exercise the code paths where GFP_NOWAIT is used and where ERR_PTR(err) is returned. I used the realloc test case included later in this series to trigger a scenario where a realloc happens inside bpf_iter_udp_batch and made a small code tweak to force the first realloc attempt to allocate a too-small batch, thus requiring another attempt with GFP_NOWAIT. Some printks showed both reallocs with the tests passing: Apr 25 23:16:24 crow kernel: go again GFP_USER Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT With this setup, I also forced each of the bpf_iter_udp_realloc_batch calls to return -ENOMEM to ensure that iteration ends and that the read() in userspace fails. [1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/ [2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/ Signed-off-by: Jordan Rife Signed-off-by: Martin KaFai Lau --- include/linux/udp.h | 3 ++ net/ipv4/udp.c | 81 ++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index 895240177f4f..4e1a672af4c5 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) +#define udp_portaddr_for_each_entry_from(__sk) \ + hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node) + #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 68a77323bc51..426a8b7c5cde 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3433,8 +3433,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; - bool resized = false; + int resizes = 0; struct sock *sk; + int err = 0; resume_bucket = state->bucket; resume_offset = iter->offset; @@ -3455,18 +3456,21 @@ again: */ iter->cur_sk = 0; iter->end_sk = 0; - iter->st_bucket_done = false; + iter->st_bucket_done = true; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) - continue; + goto next_bucket; iter->offset = 0; spin_lock_bh(&hslot2->lock); - udp_portaddr_for_each_entry(sk, &hslot2->head) { + sk = hlist_entry_safe(hslot2->head.first, struct sock, + __sk_common.skc_portaddr_node); +fill_batch: + udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { /* Resume from the last iterated socket at the * offset in the bucket before iterator was stopped. @@ -3483,33 +3487,55 @@ again: batch_sks++; } } + + /* Allocate a larger batch and try again. */ + if (unlikely(resizes <= 1 && iter->end_sk && + iter->end_sk != batch_sks)) { + resizes++; + + /* First, try with GFP_USER to maximize the chances of + * grabbing more memory. + */ + if (resizes == 1) { + spin_unlock_bh(&hslot2->lock); + err = bpf_iter_udp_realloc_batch(iter, + batch_sks * 3 / 2, + GFP_USER); + if (err) + return ERR_PTR(err); + /* Start over. */ + goto again; + } + + /* Next, hold onto the lock, so the bucket doesn't + * change while we get the rest of the sockets. + */ + err = bpf_iter_udp_realloc_batch(iter, batch_sks, + GFP_NOWAIT); + if (err) { + spin_unlock_bh(&hslot2->lock); + return ERR_PTR(err); + } + + /* Pick up where we left off. */ + sk = iter->batch[iter->end_sk - 1]; + sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, + struct sock, + __sk_common.skc_portaddr_node); + batch_sks = iter->end_sk; + goto fill_batch; + } + spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; +next_bucket: + resizes = 0; } - /* All done: no batch made. */ - if (!iter->end_sk) - return NULL; - - if (iter->end_sk == batch_sks) { - /* Batching is done for the current bucket; return the first - * socket to be iterated from the batch. - */ - iter->st_bucket_done = true; - goto done; - } - if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2, - GFP_USER)) { - resized = true; - /* After allocating a larger batch, retry one more time to grab - * the whole bucket. - */ - goto again; - } -done: - return iter->batch[0]; + WARN_ON_ONCE(iter->end_sk != batch_sks); + return iter->end_sk ? iter->batch[0] : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -3873,7 +3899,10 @@ static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, if (!new_batch) return -ENOMEM; - bpf_iter_udp_put_batch(iter); + if (flags != GFP_NOWAIT) + bpf_iter_udp_put_batch(iter); + + memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; -- cgit v1.2.3 From ca732e990fc8222a2d6782ae750304719e212fe8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:11 +0100 Subject: net: stmmac: add get_interfaces() platform method Add a get_interfaces() platform method to allow platforms to indicate to phylink which interface modes they support - which then allows phylink to validate on initialisation that the configured PHY interface mode is actually supported. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uASLn-0021Qd-Mi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 +++++++++++++--- include/linux/stmmac.h | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ac6ab121eb33..5f66f816a249 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1283,10 +1283,20 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) if (mdio_bus_data) config->default_an_inband = mdio_bus_data->default_an_inband; - /* Set the platform/firmware specified interface mode. Note, phylink - * deals with the PHY interface mode, not the MAC interface mode. + /* Get the PHY interface modes (at the PHY end of the link) that + * are supported by the platform. */ - __set_bit(priv->plat->phy_interface, config->supported_interfaces); + if (priv->plat->get_interfaces) + priv->plat->get_interfaces(priv, priv->plat->bsp_priv, + config->supported_interfaces); + + /* Set the platform/firmware specified interface mode if the + * supported interfaces have not already been provided using + * phy_interface as a last resort. + */ + if (phy_interface_empty(config->supported_interfaces)) + __set_bit(priv->plat->phy_interface, + config->supported_interfaces); /* If we have an xpcs, it defines which PHY interfaces are supported. */ if (priv->hw->xpcs) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8aed09d65b4a..537bced69c46 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,8 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, + unsigned long *interfaces); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); -- cgit v1.2.3 From 9d165dc58055d98658941a33fef9e5da866af3e9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:27 +0100 Subject: net: stmmac: remove speed_mode_2500() method Remove the speed_mode_2500() platform method which is no longer used or necessary, being superseded by the more flexible get_interfaces() method. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1uASM3-0021R3-2B@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 --- include/linux/stmmac.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5f66f816a249..28b62bd73e23 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7736,9 +7736,6 @@ int stmmac_dvr_probe(struct device *device, goto error_mdio_register; } - if (priv->plat->speed_mode_2500) - priv->plat->speed_mode_2500(ndev, priv->plat->bsp_priv); - ret = stmmac_pcs_setup(ndev); if (ret) goto error_pcs_setup; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 537bced69c46..26ddf95d23f9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,7 +241,6 @@ struct plat_stmmacenet_data { int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); - void (*speed_mode_2500)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, void *priv, unsigned int mode, -- cgit v1.2.3 From 320a66f84022028f1277bf568a5e8987eac6e797 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 1 May 2025 01:24:02 +0100 Subject: strparser: Remove unused __strp_unpause The last use of __strp_unpause() was removed in 2022 by commit 84c61fe1a75b ("tls: rx: do not use the standard strparser") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250501002402.308843-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/strparser.h | 2 -- net/strparser/strparser.c | 13 ------------- 2 files changed, 15 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 0a83010b3a64..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 95696f42647e..d946bfb424c7 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -485,19 +485,6 @@ int strp_init(struct strparser *strp, struct sock *sk, } EXPORT_SYMBOL_GPL(strp_init); -/* Sock process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp) -{ - strp->paused = 0; - - if (strp->need_bytes) { - if (strp_peek_len(strp) < strp->need_bytes) - return; - } - strp_read_sock(strp); -} -EXPORT_SYMBOL_GPL(__strp_unpause); - void strp_unpause(struct strparser *strp) { strp->paused = 0; -- cgit v1.2.3 From ac8f09b9210c48934c78fdc6bc167e660eaac928 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 2 May 2025 00:38:15 +0100 Subject: sctp: Remove unused sctp_assoc_del_peer and sctp_chunk_iif sctp_assoc_del_peer() last use was removed in 2015 by commit 73e6742027f5 ("sctp: Do not try to search for the transport twice") which now uses rm_peer instead of del_peer. sctp_chunk_iif() last use was removed in 2016 by commit 1f45f78f8e51 ("sctp: allow GSO frags to access the chunk too") Remove them. Signed-off-by: Dr. David Alan Gilbert Acked-by: Xin Long Link: https://patch.msgid.link/20250501233815.99832-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/sctp/sm.h | 1 - include/net/sctp/structs.h | 2 -- net/sctp/associola.c | 18 ------------------ net/sctp/sm_make_chunk.c | 8 -------- 4 files changed, 29 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb2..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dcd288fa1bb6..1ad7ce71d0a7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2152,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 760152e751c7..5793d71852b8 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -736,24 +736,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } -/* Delete a transport address from an association. */ -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr) -{ - struct list_head *pos; - struct list_head *temp; - struct sctp_transport *transport; - - list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { - transport = list_entry(pos, struct sctp_transport, transports); - if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { - /* Do book keeping for removing the peer and free it. */ - sctp_assoc_rm_peer(asoc, transport); - break; - } - } -} - /* Lookup a transport by address. */ struct sctp_transport *sctp_assoc_lookup_paddr( const struct sctp_association *asoc, diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f80208edd6a5..3ead591c72fd 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -115,14 +115,6 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) skb->destructor = sctp_control_release_owner; } -/* What was the inbound interface for this chunk? */ -int sctp_chunk_iif(const struct sctp_chunk *chunk) -{ - struct sk_buff *skb = chunk->skb; - - return SCTP_INPUT_CB(skb)->af->skb_iif(skb); -} - /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of -- cgit v1.2.3 From 429ac6211494c12b668dac59811ea8a96db6d757 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 5 May 2025 13:45:11 +0200 Subject: devlink: define enum for attr types of dynamic attributes Devlink param and health reporter fmsg use attributes with dynamic type which is determined according to a different type. Currently used values are NLA_*. The problem is, they are not part of UAPI. They may change which would cause a break. To make this future safe, introduce a enum that shadows NLA_* values in it and is part of UAPI. Also, this allows to possibly carry types that are unrelated to NLA_* values. Signed-off-by: Saeed Mahameed Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20250505114513.53370-3-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 24 ++++++++++++++++++++++++ include/uapi/linux/devlink.h | 15 +++++++++++++++ net/devlink/health.c | 17 +++++++++++++++-- net/devlink/netlink_gen.c | 29 ++++++++++++++++++++++++++++- net/devlink/param.c | 30 +++++++++++++++--------------- 5 files changed, 97 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bd9726269b4f..05fee1b7fe19 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -202,6 +202,28 @@ definitions: name: exception - name: control + - + type: enum + name: var-attr-type + entries: + - + name: u8 + value: 1 + - + name: u16 + - + name: u32 + - + name: u64 + - + name: string + - + name: flag + - + name: nul_string + value: 10 + - + name: binary attribute-sets: - @@ -498,6 +520,7 @@ attribute-sets: - name: param-type type: u8 + enum: var-attr-type # TODO: fill in the attributes in between @@ -592,6 +615,7 @@ attribute-sets: - name: fmsg-obj-value-type type: u8 + enum: var-attr-type # TODO: fill in the attributes in between diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9401aa343673..a5ee0f13740a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -385,6 +385,21 @@ enum devlink_linecard_state { DEVLINK_LINECARD_STATE_MAX = __DEVLINK_LINECARD_STATE_MAX - 1 }; +/* Variable attribute type. */ +enum devlink_var_attr_type { + /* Following values relate to the internal NLA_* values */ + DEVLINK_VAR_ATTR_TYPE_U8 = 1, + DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_VAR_ATTR_TYPE_U64, + DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_VAR_ATTR_TYPE_FLAG, + DEVLINK_VAR_ATTR_TYPE_NUL_STRING = 10, + DEVLINK_VAR_ATTR_TYPE_BINARY, + __DEVLINK_VAR_ATTR_TYPE_CUSTOM_BASE = 0x80, + /* Any possible custom types, unrelated to NLA_* values go below */ +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, diff --git a/net/devlink/health.c b/net/devlink/health.c index 57db6799722a..95a7a62d85a2 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -930,18 +930,31 @@ EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) { + enum devlink_var_attr_type var_attr_type; + switch (msg->nla_type) { case NLA_FLAG: + var_attr_type = DEVLINK_VAR_ATTR_TYPE_FLAG; + break; case NLA_U8: + var_attr_type = DEVLINK_VAR_ATTR_TYPE_U8; + break; case NLA_U32: + var_attr_type = DEVLINK_VAR_ATTR_TYPE_U32; + break; case NLA_U64: + var_attr_type = DEVLINK_VAR_ATTR_TYPE_U64; + break; case NLA_NUL_STRING: + var_attr_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING; + break; case NLA_BINARY: - return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, - msg->nla_type); + var_attr_type = DEVLINK_VAR_ATTR_TYPE_BINARY; + break; default: return -EINVAL; } + return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, var_attr_type); } static int diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index f9786d51f68f..e340d955cf3b 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -10,6 +10,33 @@ #include +/* Sparse enums validation callbacks */ +static int +devlink_attr_param_type_validate(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + switch (nla_get_u8(attr)) { + case DEVLINK_VAR_ATTR_TYPE_U8: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U16: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U32: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U64: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_FLAG: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_BINARY: + return 0; + } + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value"); + return -EINVAL; +} + /* Common nested types */ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, }, @@ -273,7 +300,7 @@ static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VA [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8, }, + [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate), [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2), }; diff --git a/net/devlink/param.c b/net/devlink/param.c index dcf0d1ccebba..d0fb7c88bdb8 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -167,19 +167,19 @@ static int devlink_param_set(struct devlink *devlink, } static int -devlink_param_type_to_nla_type(enum devlink_param_type param_type) +devlink_param_type_to_var_attr_type(enum devlink_param_type param_type) { switch (param_type) { case DEVLINK_PARAM_TYPE_U8: - return NLA_U8; + return DEVLINK_VAR_ATTR_TYPE_U8; case DEVLINK_PARAM_TYPE_U16: - return NLA_U16; + return DEVLINK_VAR_ATTR_TYPE_U16; case DEVLINK_PARAM_TYPE_U32: - return NLA_U32; + return DEVLINK_VAR_ATTR_TYPE_U32; case DEVLINK_PARAM_TYPE_STRING: - return NLA_STRING; + return DEVLINK_VAR_ATTR_TYPE_STRING; case DEVLINK_PARAM_TYPE_BOOL: - return NLA_FLAG; + return DEVLINK_VAR_ATTR_TYPE_FLAG; default: return -EINVAL; } @@ -247,7 +247,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; - int nla_type; + int var_attr_type; void *hdr; int err; int i; @@ -294,10 +294,10 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; - nla_type = devlink_param_type_to_nla_type(param->type); - if (nla_type < 0) + var_attr_type = devlink_param_type_to_var_attr_type(param->type); + if (var_attr_type < 0) goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, var_attr_type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, @@ -420,19 +420,19 @@ devlink_param_type_get_from_info(struct genl_info *info, return -EINVAL; switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case NLA_U8: + case DEVLINK_VAR_ATTR_TYPE_U8: *param_type = DEVLINK_PARAM_TYPE_U8; break; - case NLA_U16: + case DEVLINK_VAR_ATTR_TYPE_U16: *param_type = DEVLINK_PARAM_TYPE_U16; break; - case NLA_U32: + case DEVLINK_VAR_ATTR_TYPE_U32: *param_type = DEVLINK_PARAM_TYPE_U32; break; - case NLA_STRING: + case DEVLINK_VAR_ATTR_TYPE_STRING: *param_type = DEVLINK_PARAM_TYPE_STRING; break; - case NLA_FLAG: + case DEVLINK_VAR_ATTR_TYPE_FLAG: *param_type = DEVLINK_PARAM_TYPE_BOOL; break; default: -- cgit v1.2.3 From f9e78932eac650cf1385244482b85e65ccaa87cf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 5 May 2025 13:45:12 +0200 Subject: devlink: avoid param type value translations Assign DEVLINK_PARAM_TYPE_* enum values to DEVLINK_VAR_ATTR_TYPE_* to ensure the same values are used internally and in UAPI. Benefit from that by removing the value translations. Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20250505114513.53370-4-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 10 +++++----- net/devlink/param.c | 46 ++-------------------------------------------- 2 files changed, 7 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b8783126c1ed..0091f23a40f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -420,11 +420,11 @@ typedef u64 devlink_resource_occ_get_t(void *priv); #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { diff --git a/net/devlink/param.c b/net/devlink/param.c index d0fb7c88bdb8..b29abf8d3ed4 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -166,25 +166,6 @@ static int devlink_param_set(struct devlink *devlink, return param->set(devlink, param->id, ctx, extack); } -static int -devlink_param_type_to_var_attr_type(enum devlink_param_type param_type) -{ - switch (param_type) { - case DEVLINK_PARAM_TYPE_U8: - return DEVLINK_VAR_ATTR_TYPE_U8; - case DEVLINK_PARAM_TYPE_U16: - return DEVLINK_VAR_ATTR_TYPE_U16; - case DEVLINK_PARAM_TYPE_U32: - return DEVLINK_VAR_ATTR_TYPE_U32; - case DEVLINK_PARAM_TYPE_STRING: - return DEVLINK_VAR_ATTR_TYPE_STRING; - case DEVLINK_PARAM_TYPE_BOOL: - return DEVLINK_VAR_ATTR_TYPE_FLAG; - default: - return -EINVAL; - } -} - static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, @@ -247,7 +228,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; - int var_attr_type; void *hdr; int err; int i; @@ -293,11 +273,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; - - var_attr_type = devlink_param_type_to_var_attr_type(param->type); - if (var_attr_type < 0) - goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, var_attr_type)) + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, @@ -419,25 +395,7 @@ devlink_param_type_get_from_info(struct genl_info *info, if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; - switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case DEVLINK_VAR_ATTR_TYPE_U8: - *param_type = DEVLINK_PARAM_TYPE_U8; - break; - case DEVLINK_VAR_ATTR_TYPE_U16: - *param_type = DEVLINK_PARAM_TYPE_U16; - break; - case DEVLINK_VAR_ATTR_TYPE_U32: - *param_type = DEVLINK_PARAM_TYPE_U32; - break; - case DEVLINK_VAR_ATTR_TYPE_STRING: - *param_type = DEVLINK_PARAM_TYPE_STRING; - break; - case DEVLINK_VAR_ATTR_TYPE_FLAG: - *param_type = DEVLINK_PARAM_TYPE_BOOL; - break; - default: - return -EINVAL; - } + *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]); return 0; } -- cgit v1.2.3 From 22c64f37e1d4e757b0073a72f1439c2c3509c5cb Mon Sep 17 00:00:00 2001 From: Mohan Kumar G Date: Mon, 5 May 2025 20:58:36 +0530 Subject: wifi: mac80211: Update MCS15 support in link_conf As per IEEE 802.11be-2024 - 9.4.2.321, EHT operation element contains MCS15 Disable subfield as the sixth bit, which is set when MCS15 support is not enabled. Get MCS15 support from EHT operation params and add it in link_conf so that driver can use this value to know if EHT-MCS 15 reception is enabled. Co-developed-by: Dhanavandhana Kannan Signed-off-by: Dhanavandhana Kannan Signed-off-by: Mohan Kumar G Link: https://patch.msgid.link/20250505152836.3266829-1-quic_mkumarg@quicinc.com [remove pointless !! for bool assignment] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/mac80211.h | 3 +++ net/mac80211/cfg.c | 3 +++ 3 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 17f917cb4540..420c7f9aa6ee 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2325,6 +2325,7 @@ struct ieee80211_eht_cap_elem { #define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 #define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 +#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 /** * struct ieee80211_eht_operation - eht operation element diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fdafc37d17cc..82617579d910 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -744,6 +744,7 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This * information is the latest known value. It can come from this link's * beacon or from a beacon sent by another link. @@ -852,6 +853,8 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; }; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9017b98fea04..2cd8731d8275 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1421,6 +1421,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); + link_conf->eht_disable_mcs15 = + u8_get_bits(params->eht_oper->params, + IEEE80211_EHT_OPER_MCS15_DISABLE); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; -- cgit v1.2.3 From 4701073c3debd16d7f534f3eb808bd9b50601c0c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 6 May 2025 16:07:22 +0800 Subject: net: enetc: add initial netc-lib driver to support NTMP Some NETC functionality is controlled using control messages sent to the hardware using BD ring interface with 32B descriptor similar to transmit BD ring used on ENETC. This BD ring interface is referred to as command BD ring. It is used to configure functionality where the underlying resources may be shared between different entities or being too large to configure using direct registers. Therefore, a messaging protocol called NETC Table Management Protocol (NTMP) is provided for exchanging configuration and management information between the software and the hardware using the command BD ring interface. For the management protocol of LS1028A has been retroactively named NTMP 1.0, and its implementation is in enetc_cbdr.c and enetc_qos.c. However, NTMP of i.MX95 has been upgraded to version 2.0, which is incompatible with LS1028A, because the message formats have been changed. Therefore, add the netc-lib driver to support NTMP 2.0 to operate various tables. Note that, only MAC address filter table and RSS table are supported at the moment. More tables will be supported in subsequent patches. It is worth mentioning that the purpose of the netc-lib driver is to provide some NTMP-based generic interfaces for ENETC and NETC Switch drivers. Currently, it only supports the configurations of some tables. Interfaces such as tc flower and debugfs will be added in the future. Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250506080735.3444381-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + drivers/net/ethernet/freescale/enetc/Kconfig | 8 + drivers/net/ethernet/freescale/enetc/Makefile | 3 + drivers/net/ethernet/freescale/enetc/ntmp.c | 462 +++++++++++++++++++++ .../net/ethernet/freescale/enetc/ntmp_private.h | 103 +++++ include/linux/fsl/ntmp.h | 121 ++++++ 6 files changed, 698 insertions(+) create mode 100644 drivers/net/ethernet/freescale/enetc/ntmp.c create mode 100644 drivers/net/ethernet/freescale/enetc/ntmp_private.h create mode 100644 include/linux/fsl/ntmp.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index c1bfbe48c5a2..890699d937b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9378,6 +9378,7 @@ F: Documentation/devicetree/bindings/net/nxp,netc-blk-ctrl.yaml F: drivers/net/ethernet/freescale/enetc/ F: include/linux/fsl/enetc_mdio.h F: include/linux/fsl/netc_global.h +F: include/linux/fsl/ntmp.h FREESCALE eTSEC ETHERNET DRIVER (GIANFAR) M: Claudiu Manoil diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index 7250d3bbf6bb..616ea22ceabc 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -15,6 +15,13 @@ config NXP_ENETC_PF_COMMON If compiled as module (M), the module name is nxp-enetc-pf-common. +config NXP_NETC_LIB + tristate + help + This module provides common functionalities for both ENETC and NETC + Switch, such as NETC Table Management Protocol (NTMP) 2.0, common tc + flower and debugfs interfaces and so on. + config FSL_ENETC tristate "ENETC PF driver" depends on PCI_MSI @@ -40,6 +47,7 @@ config NXP_ENETC4 select FSL_ENETC_CORE select FSL_ENETC_MDIO select NXP_ENETC_PF_COMMON + select NXP_NETC_LIB select PHYLINK select DIMLIB help diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile index 6fd27ee4fcd1..707a68e26971 100644 --- a/drivers/net/ethernet/freescale/enetc/Makefile +++ b/drivers/net/ethernet/freescale/enetc/Makefile @@ -6,6 +6,9 @@ fsl-enetc-core-y := enetc.o enetc_cbdr.o enetc_ethtool.o obj-$(CONFIG_NXP_ENETC_PF_COMMON) += nxp-enetc-pf-common.o nxp-enetc-pf-common-y := enetc_pf_common.o +obj-$(CONFIG_NXP_NETC_LIB) += nxp-netc-lib.o +nxp-netc-lib-y := ntmp.o + obj-$(CONFIG_FSL_ENETC) += fsl-enetc.o fsl-enetc-y := enetc_pf.o fsl-enetc-$(CONFIG_PCI_IOV) += enetc_msg.o diff --git a/drivers/net/ethernet/freescale/enetc/ntmp.c b/drivers/net/ethernet/freescale/enetc/ntmp.c new file mode 100644 index 000000000000..ba32c1bbd9e1 --- /dev/null +++ b/drivers/net/ethernet/freescale/enetc/ntmp.c @@ -0,0 +1,462 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* + * NETC NTMP (NETC Table Management Protocol) 2.0 Library + * Copyright 2025 NXP + */ + +#include +#include +#include + +#include "ntmp_private.h" + +#define NETC_CBDR_TIMEOUT 1000 /* us */ +#define NETC_CBDR_DELAY_US 10 +#define NETC_CBDR_MR_EN BIT(31) + +#define NTMP_BASE_ADDR_ALIGN 128 +#define NTMP_DATA_ADDR_ALIGN 32 + +/* Define NTMP Table ID */ +#define NTMP_MAFT_ID 1 +#define NTMP_RSST_ID 3 + +/* Generic Update Actions for most tables */ +#define NTMP_GEN_UA_CFGEU BIT(0) +#define NTMP_GEN_UA_STSEU BIT(1) + +#define NTMP_ENTRY_ID_SIZE 4 +#define RSST_ENTRY_NUM 64 +#define RSST_STSE_DATA_SIZE(n) ((n) * 8) +#define RSST_CFGE_DATA_SIZE(n) (n) + +int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs) +{ + int cbd_num = NETC_CBDR_BD_NUM; + size_t size; + + size = cbd_num * sizeof(union netc_cbd) + NTMP_BASE_ADDR_ALIGN; + cbdr->addr_base = dma_alloc_coherent(dev, size, &cbdr->dma_base, + GFP_KERNEL); + if (!cbdr->addr_base) + return -ENOMEM; + + cbdr->dma_size = size; + cbdr->bd_num = cbd_num; + cbdr->regs = *regs; + cbdr->dev = dev; + + /* The base address of the Control BD Ring must be 128 bytes aligned */ + cbdr->dma_base_align = ALIGN(cbdr->dma_base, NTMP_BASE_ADDR_ALIGN); + cbdr->addr_base_align = PTR_ALIGN(cbdr->addr_base, + NTMP_BASE_ADDR_ALIGN); + + cbdr->next_to_clean = 0; + cbdr->next_to_use = 0; + spin_lock_init(&cbdr->ring_lock); + + /* Step 1: Configure the base address of the Control BD Ring */ + netc_write(cbdr->regs.bar0, lower_32_bits(cbdr->dma_base_align)); + netc_write(cbdr->regs.bar1, upper_32_bits(cbdr->dma_base_align)); + + /* Step 2: Configure the producer index register */ + netc_write(cbdr->regs.pir, cbdr->next_to_clean); + + /* Step 3: Configure the consumer index register */ + netc_write(cbdr->regs.cir, cbdr->next_to_use); + + /* Step4: Configure the number of BDs of the Control BD Ring */ + netc_write(cbdr->regs.lenr, cbdr->bd_num); + + /* Step 5: Enable the Control BD Ring */ + netc_write(cbdr->regs.mr, NETC_CBDR_MR_EN); + + return 0; +} +EXPORT_SYMBOL_GPL(ntmp_init_cbdr); + +void ntmp_free_cbdr(struct netc_cbdr *cbdr) +{ + /* Disable the Control BD Ring */ + netc_write(cbdr->regs.mr, 0); + dma_free_coherent(cbdr->dev, cbdr->dma_size, cbdr->addr_base, + cbdr->dma_base); + memset(cbdr, 0, sizeof(*cbdr)); +} +EXPORT_SYMBOL_GPL(ntmp_free_cbdr); + +static int ntmp_get_free_cbd_num(struct netc_cbdr *cbdr) +{ + return (cbdr->next_to_clean - cbdr->next_to_use - 1 + + cbdr->bd_num) % cbdr->bd_num; +} + +static union netc_cbd *ntmp_get_cbd(struct netc_cbdr *cbdr, int index) +{ + return &((union netc_cbd *)(cbdr->addr_base_align))[index]; +} + +static void ntmp_clean_cbdr(struct netc_cbdr *cbdr) +{ + union netc_cbd *cbd; + int i; + + i = cbdr->next_to_clean; + while (netc_read(cbdr->regs.cir) != i) { + cbd = ntmp_get_cbd(cbdr, i); + memset(cbd, 0, sizeof(*cbd)); + i = (i + 1) % cbdr->bd_num; + } + + cbdr->next_to_clean = i; +} + +static int netc_xmit_ntmp_cmd(struct ntmp_user *user, union netc_cbd *cbd) +{ + union netc_cbd *cur_cbd; + struct netc_cbdr *cbdr; + int i, err; + u16 status; + u32 val; + + /* Currently only i.MX95 ENETC is supported, and it only has one + * command BD ring + */ + cbdr = &user->ring[0]; + + spin_lock_bh(&cbdr->ring_lock); + + if (unlikely(!ntmp_get_free_cbd_num(cbdr))) + ntmp_clean_cbdr(cbdr); + + i = cbdr->next_to_use; + cur_cbd = ntmp_get_cbd(cbdr, i); + *cur_cbd = *cbd; + dma_wmb(); + + /* Update producer index of both software and hardware */ + i = (i + 1) % cbdr->bd_num; + cbdr->next_to_use = i; + netc_write(cbdr->regs.pir, i); + + err = read_poll_timeout_atomic(netc_read, val, val == i, + NETC_CBDR_DELAY_US, NETC_CBDR_TIMEOUT, + true, cbdr->regs.cir); + if (unlikely(err)) + goto cbdr_unlock; + + dma_rmb(); + /* Get the writeback command BD, because the caller may need + * to check some other fields of the response header. + */ + *cbd = *cur_cbd; + + /* Check the writeback error status */ + status = le16_to_cpu(cbd->resp_hdr.error_rr) & NTMP_RESP_ERROR; + if (unlikely(status)) { + err = -EIO; + dev_err(user->dev, "Command BD error: 0x%04x\n", status); + } + + ntmp_clean_cbdr(cbdr); + dma_wmb(); + +cbdr_unlock: + spin_unlock_bh(&cbdr->ring_lock); + + return err; +} + +static int ntmp_alloc_data_mem(struct ntmp_dma_buf *data, void **buf_align) +{ + void *buf; + + buf = dma_alloc_coherent(data->dev, data->size + NTMP_DATA_ADDR_ALIGN, + &data->dma, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + data->buf = buf; + *buf_align = PTR_ALIGN(buf, NTMP_DATA_ADDR_ALIGN); + + return 0; +} + +static void ntmp_free_data_mem(struct ntmp_dma_buf *data) +{ + dma_free_coherent(data->dev, data->size + NTMP_DATA_ADDR_ALIGN, + data->buf, data->dma); +} + +static void ntmp_fill_request_hdr(union netc_cbd *cbd, dma_addr_t dma, + int len, int table_id, int cmd, + int access_method) +{ + dma_addr_t dma_align; + + memset(cbd, 0, sizeof(*cbd)); + dma_align = ALIGN(dma, NTMP_DATA_ADDR_ALIGN); + cbd->req_hdr.addr = cpu_to_le64(dma_align); + cbd->req_hdr.len = cpu_to_le32(len); + cbd->req_hdr.cmd = cmd; + cbd->req_hdr.access_method = FIELD_PREP(NTMP_ACCESS_METHOD, + access_method); + cbd->req_hdr.table_id = table_id; + cbd->req_hdr.ver_cci_rr = FIELD_PREP(NTMP_HDR_VERSION, + NTMP_HDR_VER2); + /* For NTMP version 2.0 or later version */ + cbd->req_hdr.npf = cpu_to_le32(NTMP_NPF); +} + +static void ntmp_fill_crd(struct ntmp_cmn_req_data *crd, u8 tblv, + u8 qa, u16 ua) +{ + crd->update_act = cpu_to_le16(ua); + crd->tblv_qact = NTMP_TBLV_QACT(tblv, qa); +} + +static void ntmp_fill_crd_eid(struct ntmp_req_by_eid *rbe, u8 tblv, + u8 qa, u16 ua, u32 entry_id) +{ + ntmp_fill_crd(&rbe->crd, tblv, qa, ua); + rbe->entry_id = cpu_to_le32(entry_id); +} + +static const char *ntmp_table_name(int tbl_id) +{ + switch (tbl_id) { + case NTMP_MAFT_ID: + return "MAC Address Filter Table"; + case NTMP_RSST_ID: + return "RSS Table"; + default: + return "Unknown Table"; + }; +} + +static int ntmp_delete_entry_by_id(struct ntmp_user *user, int tbl_id, + u8 tbl_ver, u32 entry_id, u32 req_len, + u32 resp_len) +{ + struct ntmp_dma_buf data = { + .dev = user->dev, + .size = max(req_len, resp_len), + }; + struct ntmp_req_by_eid *req; + union netc_cbd cbd; + int err; + + err = ntmp_alloc_data_mem(&data, (void **)&req); + if (err) + return err; + + ntmp_fill_crd_eid(req, tbl_ver, 0, 0, entry_id); + ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(req_len, resp_len), + tbl_id, NTMP_CMD_DELETE, NTMP_AM_ENTRY_ID); + + err = netc_xmit_ntmp_cmd(user, &cbd); + if (err) + dev_err(user->dev, + "Failed to delete entry 0x%x of %s, err: %pe", + entry_id, ntmp_table_name(tbl_id), ERR_PTR(err)); + + ntmp_free_data_mem(&data); + + return err; +} + +static int ntmp_query_entry_by_id(struct ntmp_user *user, int tbl_id, + u32 len, struct ntmp_req_by_eid *req, + dma_addr_t dma, bool compare_eid) +{ + struct ntmp_cmn_resp_query *resp; + int cmd = NTMP_CMD_QUERY; + union netc_cbd cbd; + u32 entry_id; + int err; + + entry_id = le32_to_cpu(req->entry_id); + if (le16_to_cpu(req->crd.update_act)) + cmd = NTMP_CMD_QU; + + /* Request header */ + ntmp_fill_request_hdr(&cbd, dma, len, tbl_id, cmd, NTMP_AM_ENTRY_ID); + err = netc_xmit_ntmp_cmd(user, &cbd); + if (err) { + dev_err(user->dev, + "Failed to query entry 0x%x of %s, err: %pe\n", + entry_id, ntmp_table_name(tbl_id), ERR_PTR(err)); + return err; + } + + /* For a few tables, the first field of their response data is not + * entry_id, so directly return success. + */ + if (!compare_eid) + return 0; + + resp = (struct ntmp_cmn_resp_query *)req; + if (unlikely(le32_to_cpu(resp->entry_id) != entry_id)) { + dev_err(user->dev, + "%s: query EID 0x%x doesn't match response EID 0x%x\n", + ntmp_table_name(tbl_id), entry_id, le32_to_cpu(resp->entry_id)); + return -EIO; + } + + return 0; +} + +int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + struct ntmp_dma_buf data = { + .dev = user->dev, + .size = sizeof(struct maft_req_add), + }; + struct maft_req_add *req; + union netc_cbd cbd; + int err; + + err = ntmp_alloc_data_mem(&data, (void **)&req); + if (err) + return err; + + /* Set mac address filter table request data buffer */ + ntmp_fill_crd_eid(&req->rbe, user->tbl.maft_ver, 0, 0, entry_id); + req->keye = maft->keye; + req->cfge = maft->cfge; + + ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(data.size, 0), + NTMP_MAFT_ID, NTMP_CMD_ADD, NTMP_AM_ENTRY_ID); + err = netc_xmit_ntmp_cmd(user, &cbd); + if (err) + dev_err(user->dev, "Failed to add MAFT entry 0x%x, err: %pe\n", + entry_id, ERR_PTR(err)); + + ntmp_free_data_mem(&data); + + return err; +} +EXPORT_SYMBOL_GPL(ntmp_maft_add_entry); + +int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + struct ntmp_dma_buf data = { + .dev = user->dev, + .size = sizeof(struct maft_resp_query), + }; + struct maft_resp_query *resp; + struct ntmp_req_by_eid *req; + int err; + + err = ntmp_alloc_data_mem(&data, (void **)&req); + if (err) + return err; + + ntmp_fill_crd_eid(req, user->tbl.maft_ver, 0, 0, entry_id); + err = ntmp_query_entry_by_id(user, NTMP_MAFT_ID, + NTMP_LEN(sizeof(*req), data.size), + req, data.dma, true); + if (err) + goto end; + + resp = (struct maft_resp_query *)req; + maft->keye = resp->keye; + maft->cfge = resp->cfge; + +end: + ntmp_free_data_mem(&data); + + return err; +} +EXPORT_SYMBOL_GPL(ntmp_maft_query_entry); + +int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id) +{ + return ntmp_delete_entry_by_id(user, NTMP_MAFT_ID, user->tbl.maft_ver, + entry_id, NTMP_EID_REQ_LEN, 0); +} +EXPORT_SYMBOL_GPL(ntmp_maft_delete_entry); + +int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table, + int count) +{ + struct ntmp_dma_buf data = {.dev = user->dev}; + struct rsst_req_update *req; + union netc_cbd cbd; + int err, i; + + if (count != RSST_ENTRY_NUM) + /* HW only takes in a full 64 entry table */ + return -EINVAL; + + data.size = struct_size(req, groups, count); + err = ntmp_alloc_data_mem(&data, (void **)&req); + if (err) + return err; + + /* Set the request data buffer */ + ntmp_fill_crd_eid(&req->rbe, user->tbl.rsst_ver, 0, + NTMP_GEN_UA_CFGEU | NTMP_GEN_UA_STSEU, 0); + for (i = 0; i < count; i++) + req->groups[i] = (u8)(table[i]); + + ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(data.size, 0), + NTMP_RSST_ID, NTMP_CMD_UPDATE, NTMP_AM_ENTRY_ID); + + err = netc_xmit_ntmp_cmd(user, &cbd); + if (err) + dev_err(user->dev, "Failed to update RSST entry, err: %pe\n", + ERR_PTR(err)); + + ntmp_free_data_mem(&data); + + return err; +} +EXPORT_SYMBOL_GPL(ntmp_rsst_update_entry); + +int ntmp_rsst_query_entry(struct ntmp_user *user, u32 *table, int count) +{ + struct ntmp_dma_buf data = {.dev = user->dev}; + struct ntmp_req_by_eid *req; + union netc_cbd cbd; + int err, i; + u8 *group; + + if (count != RSST_ENTRY_NUM) + /* HW only takes in a full 64 entry table */ + return -EINVAL; + + data.size = NTMP_ENTRY_ID_SIZE + RSST_STSE_DATA_SIZE(count) + + RSST_CFGE_DATA_SIZE(count); + err = ntmp_alloc_data_mem(&data, (void **)&req); + if (err) + return err; + + /* Set the request data buffer */ + ntmp_fill_crd_eid(req, user->tbl.rsst_ver, 0, 0, 0); + ntmp_fill_request_hdr(&cbd, data.dma, NTMP_LEN(sizeof(*req), data.size), + NTMP_RSST_ID, NTMP_CMD_QUERY, NTMP_AM_ENTRY_ID); + err = netc_xmit_ntmp_cmd(user, &cbd); + if (err) { + dev_err(user->dev, "Failed to query RSST entry, err: %pe\n", + ERR_PTR(err)); + goto end; + } + + group = (u8 *)req; + group += NTMP_ENTRY_ID_SIZE + RSST_STSE_DATA_SIZE(count); + for (i = 0; i < count; i++) + table[i] = group[i]; + +end: + ntmp_free_data_mem(&data); + + return err; +} +EXPORT_SYMBOL_GPL(ntmp_rsst_query_entry); + +MODULE_DESCRIPTION("NXP NETC Library"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/net/ethernet/freescale/enetc/ntmp_private.h b/drivers/net/ethernet/freescale/enetc/ntmp_private.h new file mode 100644 index 000000000000..685b388e7ddf --- /dev/null +++ b/drivers/net/ethernet/freescale/enetc/ntmp_private.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * NTMP table request and response data buffer formats + * Copyright 2025 NXP + */ + +#ifndef __NTMP_PRIVATE_H +#define __NTMP_PRIVATE_H + +#include + +#define NTMP_EID_REQ_LEN 8 +#define NETC_CBDR_BD_NUM 256 + +union netc_cbd { + struct { + __le64 addr; + __le32 len; +#define NTMP_RESP_LEN GENMASK(19, 0) +#define NTMP_REQ_LEN GENMASK(31, 20) +#define NTMP_LEN(req, resp) (FIELD_PREP(NTMP_REQ_LEN, (req)) | \ + ((resp) & NTMP_RESP_LEN)) + u8 cmd; +#define NTMP_CMD_DELETE BIT(0) +#define NTMP_CMD_UPDATE BIT(1) +#define NTMP_CMD_QUERY BIT(2) +#define NTMP_CMD_ADD BIT(3) +#define NTMP_CMD_QU (NTMP_CMD_QUERY | NTMP_CMD_UPDATE) + u8 access_method; +#define NTMP_ACCESS_METHOD GENMASK(7, 4) +#define NTMP_AM_ENTRY_ID 0 +#define NTMP_AM_EXACT_KEY 1 +#define NTMP_AM_SEARCH 2 +#define NTMP_AM_TERNARY_KEY 3 + u8 table_id; + u8 ver_cci_rr; +#define NTMP_HDR_VERSION GENMASK(5, 0) +#define NTMP_HDR_VER2 2 +#define NTMP_CCI BIT(6) +#define NTMP_RR BIT(7) + __le32 resv[3]; + __le32 npf; +#define NTMP_NPF BIT(15) + } req_hdr; /* NTMP Request Message Header Format */ + + struct { + __le32 resv0[3]; + __le16 num_matched; + __le16 error_rr; +#define NTMP_RESP_ERROR GENMASK(11, 0) +#define NTMP_RESP_RR BIT(15) + __le32 resv1[4]; + } resp_hdr; /* NTMP Response Message Header Format */ +}; + +struct ntmp_dma_buf { + struct device *dev; + size_t size; + void *buf; + dma_addr_t dma; +}; + +struct ntmp_cmn_req_data { + __le16 update_act; + u8 dbg_opt; + u8 tblv_qact; +#define NTMP_QUERY_ACT GENMASK(3, 0) +#define NTMP_TBL_VER GENMASK(7, 4) +#define NTMP_TBLV_QACT(v, a) (FIELD_PREP(NTMP_TBL_VER, (v)) | \ + ((a) & NTMP_QUERY_ACT)) +}; + +struct ntmp_cmn_resp_query { + __le32 entry_id; +}; + +/* Generic structure for request data by entry ID */ +struct ntmp_req_by_eid { + struct ntmp_cmn_req_data crd; + __le32 entry_id; +}; + +/* MAC Address Filter Table Request Data Buffer Format of Add action */ +struct maft_req_add { + struct ntmp_req_by_eid rbe; + struct maft_keye_data keye; + struct maft_cfge_data cfge; +}; + +/* MAC Address Filter Table Response Data Buffer Format of Query action */ +struct maft_resp_query { + __le32 entry_id; + struct maft_keye_data keye; + struct maft_cfge_data cfge; +}; + +/* RSS Table Request Data Buffer Format of Update action */ +struct rsst_req_update { + struct ntmp_req_by_eid rbe; + u8 groups[]; +}; + +#endif diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h new file mode 100644 index 000000000000..916dc4fe7de3 --- /dev/null +++ b/include/linux/fsl/ntmp.h @@ -0,0 +1,121 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2025 NXP */ +#ifndef __NETC_NTMP_H +#define __NETC_NTMP_H + +#include +#include + +struct maft_keye_data { + u8 mac_addr[ETH_ALEN]; + __le16 resv; +}; + +struct maft_cfge_data { + __le16 si_bitmap; + __le16 resv; +}; + +struct netc_cbdr_regs { + void __iomem *pir; + void __iomem *cir; + void __iomem *mr; + + void __iomem *bar0; + void __iomem *bar1; + void __iomem *lenr; +}; + +struct netc_tbl_vers { + u8 maft_ver; + u8 rsst_ver; +}; + +struct netc_cbdr { + struct device *dev; + struct netc_cbdr_regs regs; + + int bd_num; + int next_to_use; + int next_to_clean; + + int dma_size; + void *addr_base; + void *addr_base_align; + dma_addr_t dma_base; + dma_addr_t dma_base_align; + + /* Serialize the order of command BD ring */ + spinlock_t ring_lock; +}; + +struct ntmp_user { + int cbdr_num; /* number of control BD ring */ + struct device *dev; + struct netc_cbdr *ring; + struct netc_tbl_vers tbl; +}; + +struct maft_entry_data { + struct maft_keye_data keye; + struct maft_cfge_data cfge; +}; + +#if IS_ENABLED(CONFIG_NXP_NETC_LIB) +int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs); +void ntmp_free_cbdr(struct netc_cbdr *cbdr); + +/* NTMP APIs */ +int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); +int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft); +int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id); +int ntmp_rsst_update_entry(struct ntmp_user *user, const u32 *table, + int count); +int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count); +#else +static inline int ntmp_init_cbdr(struct netc_cbdr *cbdr, struct device *dev, + const struct netc_cbdr_regs *regs) +{ + return 0; +} + +static inline void ntmp_free_cbdr(struct netc_cbdr *cbdr) +{ +} + +static inline int ntmp_maft_add_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_query_entry(struct ntmp_user *user, u32 entry_id, + struct maft_entry_data *maft) +{ + return 0; +} + +static inline int ntmp_maft_delete_entry(struct ntmp_user *user, u32 entry_id) +{ + return 0; +} + +static inline int ntmp_rsst_update_entry(struct ntmp_user *user, + const u32 *table, int count) +{ + return 0; +} + +static inline int ntmp_rsst_query_entry(struct ntmp_user *user, + u32 *table, int count) +{ + return 0; +} + +#endif + +#endif -- cgit v1.2.3 From c24a65b6a27c78d8540409800886b6622ea86ebf Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 15 Apr 2025 21:15:49 -0500 Subject: iidc/ice/irdma: Update IDC to support multiple consumers In preparation of supporting more than a single core PCI driver for RDMA, move ice specific structs like qset_params, qos_info and qos_params from iidc_rdma.h to iidc_rdma_ice.h. Previously, the ice driver was just exporting its entire PF struct to the auxiliary driver, but since each core driver will have its own different PF struct, implement a universal struct that all core drivers can provide to the auxiliary driver through the probe call. Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Co-developed-by: Mustafa Ismail Signed-off-by: Mustafa Ismail Co-developed-by: Shiraz Saleem Signed-off-by: Shiraz Saleem Co-developed-by: Tatyana Nikolova Signed-off-by: Tatyana Nikolova Signed-off-by: Tony Nguyen --- drivers/infiniband/hw/irdma/main.c | 102 ++++++------ drivers/infiniband/hw/irdma/main.h | 1 + drivers/net/ethernet/intel/ice/devlink/devlink.c | 45 ++++-- drivers/net/ethernet/intel/ice/ice.h | 6 +- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 39 ++++- drivers/net/ethernet/intel/ice/ice_dcb_lib.h | 9 ++ drivers/net/ethernet/intel/ice/ice_ethtool.c | 8 +- drivers/net/ethernet/intel/ice/ice_idc.c | 192 +++++++++++++++-------- drivers/net/ethernet/intel/ice/ice_main.c | 10 +- include/linux/net/intel/iidc_rdma.h | 67 +++----- include/linux/net/intel/iidc_rdma_ice.h | 69 ++++++-- 11 files changed, 348 insertions(+), 200 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 46489c0ab511..abb532bc8ce4 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -1,10 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "main.h" -#include "../../../net/ethernet/intel/ice/ice.h" MODULE_ALIAS("i40iw"); -MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Ethernet Protocol Driver for RDMA"); MODULE_LICENSE("Dual BSD/GPL"); @@ -85,9 +83,10 @@ static void irdma_fill_qos_info(struct irdma_l2params *l2params, } } -static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_rdma_event *event) +static void irdma_iidc_event_handler(struct iidc_rdma_core_dev_info *cdev_info, + struct iidc_rdma_event *event) { - struct irdma_device *iwdev = dev_get_drvdata(&pf->adev->dev); + struct irdma_device *iwdev = dev_get_drvdata(&cdev_info->adev->dev); struct irdma_l2params l2params = {}; if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_MTU_CHANGE)) { @@ -104,17 +103,18 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_rdma_event * irdma_prep_tc_change(iwdev); } else if (*event->type & BIT(IIDC_RDMA_EVENT_AFTER_TC_CHANGE)) { - struct iidc_rdma_qos_params qos_info = {}; + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; if (!iwdev->vsi.tc_change_pending) return; l2params.tc_changed = true; ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = qos_info.num_tc > 1 && !l2params.dscp_mode; + iwdev->dcb_vlan_mode = + l2params.num_tc > 1 && !l2params.dscp_mode; irdma_change_l2params(&iwdev->vsi, &l2params); } else if (*event->type & BIT(IIDC_RDMA_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", @@ -151,10 +151,8 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_rdma_event * */ static void irdma_request_reset(struct irdma_pci_f *rf) { - struct ice_pf *pf = rf->cdev; - ibdev_warn(&rf->iwdev->ibdev, "Requesting a reset\n"); - ice_rdma_request_reset(pf, IIDC_PFR); + ice_rdma_request_reset(rf->cdev, IIDC_FUNC_RESET); } /** @@ -166,14 +164,15 @@ static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; int ret; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; - ret = ice_add_rdma_qset(pf, &qset); + ret = ice_add_rdma_qset(cdev_info, &qset); if (ret) { ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); return ret; @@ -194,19 +193,20 @@ static void irdma_lan_unregister_qset(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; - struct ice_pf *pf = iwdev->rf->cdev; + struct iidc_rdma_core_dev_info *cdev_info; struct iidc_rdma_qset_params qset = {}; + cdev_info = iwdev->rf->cdev; qset.qs_handle = tc_node->qs_handle; qset.tc = tc_node->traffic_class; qset.vport_id = vsi->vsi_idx; qset.teid = tc_node->l2_sched_node_id; - if (ice_del_rdma_qset(pf, &qset)) + if (ice_del_rdma_qset(cdev_info, &qset)) ibdev_dbg(&iwdev->ibdev, "WS: LAN free_res for rdma qset failed.\n"); } -static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +static int irdma_init_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) { int i; @@ -217,12 +217,12 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) return -ENOMEM; for (i = 0; i < rf->msix_count; i++) - if (ice_alloc_rdma_qvector(pf, &rf->msix_entries[i])) + if (ice_alloc_rdma_qvector(cdev, &rf->msix_entries[i])) break; if (i < IRDMA_MIN_MSIX) { for (; i > 0; i--) - ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); kfree(rf->msix_entries); return -ENOMEM; @@ -233,12 +233,12 @@ static int irdma_init_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) return 0; } -static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct ice_pf *pf) +static void irdma_deinit_interrupts(struct irdma_pci_f *rf, struct iidc_rdma_core_dev_info *cdev) { int i; for (i = 0; i < rf->msix_count; i++) - ice_free_rdma_qvector(pf, &rf->msix_entries[i]); + ice_free_rdma_qvector(cdev, &rf->msix_entries[i]); kfree(rf->msix_entries); } @@ -247,41 +247,49 @@ static void irdma_remove(struct auxiliary_device *aux_dev) { struct irdma_device *iwdev = auxiliary_get_drvdata(aux_dev); struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct ice_pf *pf; + struct iidc_rdma_core_dev_info *cdev_info; iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - pf = iidc_adev->pf; + cdev_info = iidc_adev->cdev_info; + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, false); irdma_ib_unregister_device(iwdev); - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, false); - irdma_deinit_interrupts(iwdev->rf, pf); + irdma_deinit_interrupts(iwdev->rf, cdev_info); - pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(pf->pdev->devfn)); + pr_debug("INIT: Gen2 PF[%d] device remove success\n", PCI_FUNC(cdev_info->pdev->devfn)); } -static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf, - struct ice_vsi *vsi) +static void irdma_fill_device_info(struct irdma_device *iwdev, + struct iidc_rdma_core_dev_info *cdev_info) { + struct iidc_rdma_priv_dev_info *iidc_priv = cdev_info->iidc_priv; struct irdma_pci_f *rf = iwdev->rf; - rf->cdev = pf; + rf->sc_dev.hw = &rf->hw; + rf->iwdev = iwdev; + rf->cdev = cdev_info; + rf->hw.hw_addr = iidc_priv->hw_addr; + rf->pcidev = cdev_info->pdev; + rf->hw.device = &rf->pcidev->dev; + rf->pf_id = iidc_priv->pf_id; rf->gen_ops.register_qset = irdma_lan_register_qset; rf->gen_ops.unregister_qset = irdma_lan_unregister_qset; - rf->hw.hw_addr = pf->hw.hw_addr; - rf->pcidev = pf->pdev; - rf->pf_id = pf->hw.pf_id; - rf->default_vsi.vsi_idx = vsi->vsi_num; - rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? - IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; + + rf->default_vsi.vsi_idx = iidc_priv->vport_id; + rf->protocol_used = + cdev_info->rdma_protocol == IIDC_RDMA_PROTOCOL_ROCEV2 ? + IRDMA_ROCE_PROTOCOL_ONLY : IRDMA_IWARP_PROTOCOL_ONLY; rf->rdma_ver = IRDMA_GEN_2; rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; rf->rst_to = IRDMA_RST_TIMEOUT_HZ; rf->gen_ops.request_reset = irdma_request_reset; rf->limits_sel = 7; rf->iwdev = iwdev; + mutex_init(&iwdev->ah_tbl_lock); - iwdev->netdev = vsi->netdev; - iwdev->vsi_num = vsi->vsi_num; + + iwdev->netdev = iidc_priv->netdev; + iwdev->vsi_num = iidc_priv->vport_id; iwdev->init_state = INITIAL_STATE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; @@ -294,20 +302,17 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_device_id *id) { struct iidc_rdma_core_auxiliary_dev *iidc_adev; - struct iidc_rdma_qos_params qos_info = {}; + struct iidc_rdma_core_dev_info *cdev_info; + struct iidc_rdma_priv_dev_info *iidc_priv; struct irdma_l2params l2params = {}; struct irdma_device *iwdev; struct irdma_pci_f *rf; - struct ice_vsi *vsi; - struct ice_pf *pf; int err; iidc_adev = container_of(aux_dev, struct iidc_rdma_core_auxiliary_dev, adev); - pf = iidc_adev->pf; - vsi = ice_get_main_vsi(pf); + cdev_info = iidc_adev->cdev_info; + iidc_priv = cdev_info->iidc_priv; - if (!vsi) - return -EIO; iwdev = ib_alloc_device(irdma_device, ibdev); if (!iwdev) return -ENOMEM; @@ -317,10 +322,10 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ return -ENOMEM; } - irdma_fill_device_info(iwdev, pf, vsi); + irdma_fill_device_info(iwdev, cdev_info); rf = iwdev->rf; - err = irdma_init_interrupts(rf, pf); + err = irdma_init_interrupts(rf, cdev_info); if (err) goto err_init_interrupts; @@ -329,8 +334,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ goto err_ctrl_init; l2params.mtu = iwdev->netdev->mtu; - ice_get_qos_params(pf, &qos_info); - irdma_fill_qos_info(&l2params, &qos_info); + irdma_fill_qos_info(&l2params, &iidc_priv->qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; @@ -342,7 +346,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ if (err) goto err_ibreg; - ice_rdma_update_vsi_filter(pf, iwdev->vsi_num, true); + ice_rdma_update_vsi_filter(cdev_info, iwdev->vsi_num, true); ibdev_dbg(&iwdev->ibdev, "INIT: Gen2 PF[%d] device probe success\n", PCI_FUNC(rf->pcidev->devfn)); auxiliary_set_drvdata(aux_dev, iwdev); @@ -354,7 +358,7 @@ err_ibreg: err_rt_init: irdma_ctrl_deinit_hw(rf); err_ctrl_init: - irdma_deinit_interrupts(rf, pf); + irdma_deinit_interrupts(rf, cdev_info); err_init_interrupts: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index e8083b0c8cb2..674acc952168 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -30,6 +30,7 @@ #endif #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c index fcb199efbea5..4af60e2f37df 100644 --- a/drivers/net/ethernet/intel/ice/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c @@ -1339,8 +1339,13 @@ ice_devlink_enable_roce_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; - ctx->val.vbool = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? true : false; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + + ctx->val.vbool = !!(cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2); return 0; } @@ -1350,19 +1355,24 @@ static int ice_devlink_enable_roce_set(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; bool roce_ena = ctx->val.vbool; int ret; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + if (!roce_ena) { ice_unplug_aux_dev(pf); - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_ROCEV2; return 0; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol |= IIDC_RDMA_PROTOCOL_ROCEV2; ret = ice_plug_aux_dev(pf); if (ret) - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_ROCEV2; return ret; } @@ -1373,11 +1383,16 @@ ice_devlink_enable_roce_validate(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; + + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; if (!test_bit(ICE_FLAG_RDMA_ENA, pf->flags)) return -EOPNOTSUPP; - if (pf->rdma_mode & IIDC_RDMA_PROTOCOL_IWARP) { + if (cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_IWARP) { NL_SET_ERR_MSG_MOD(extack, "iWARP is currently enabled. This device cannot enable iWARP and RoCEv2 simultaneously"); return -EOPNOTSUPP; } @@ -1390,8 +1405,13 @@ ice_devlink_enable_iw_get(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; - ctx->val.vbool = pf->rdma_mode & IIDC_RDMA_PROTOCOL_IWARP; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + + ctx->val.vbool = !!(cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_IWARP); return 0; } @@ -1401,19 +1421,24 @@ static int ice_devlink_enable_iw_set(struct devlink *devlink, u32 id, struct netlink_ext_ack *extack) { struct ice_pf *pf = devlink_priv(devlink); + struct iidc_rdma_core_dev_info *cdev; bool iw_ena = ctx->val.vbool; int ret; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + if (!iw_ena) { ice_unplug_aux_dev(pf); - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_IWARP; return 0; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol |= IIDC_RDMA_PROTOCOL_IWARP; ret = ice_plug_aux_dev(pf); if (ret) - pf->rdma_mode &= ~IIDC_RDMA_PROTOCOL_IWARP; + cdev->rdma_protocol &= ~IIDC_RDMA_PROTOCOL_IWARP; return ret; } @@ -1428,7 +1453,7 @@ ice_devlink_enable_iw_validate(struct devlink *devlink, u32 id, if (!test_bit(ICE_FLAG_RDMA_ENA, pf->flags)) return -EOPNOTSUPP; - if (pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2) { + if (pf->cdev_info->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2) { NL_SET_ERR_MSG_MOD(extack, "RoCEv2 is currently enabled. This device cannot enable iWARP and RoCEv2 simultaneously"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index fd083647c14a..e27d9044bcb3 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -401,7 +401,6 @@ struct ice_vsi { u16 req_rxq; /* User requested Rx queues */ u16 num_rx_desc; u16 num_tx_desc; - u16 qset_handle[ICE_MAX_TRAFFIC_CLASS]; struct ice_tc_cfg tc_cfg; struct bpf_prog *xdp_prog; struct ice_tx_ring **xdp_rings; /* XDP ring array */ @@ -557,7 +556,6 @@ struct ice_pf { struct devlink_port devlink_port; /* OS reserved IRQ details */ - struct msix_entry *msix_entries; struct ice_irq_tracker irq_tracker; struct ice_virt_irq_tracker virt_irq_tracker; @@ -592,7 +590,6 @@ struct ice_pf { struct gnss_serial *gnss_serial; struct gnss_device *gnss_dev; u16 num_rdma_msix; /* Total MSIX vectors for RDMA driver */ - u16 rdma_base_vector; /* spinlock to protect the AdminQ wait list */ spinlock_t aq_wait_lock; @@ -625,14 +622,12 @@ struct ice_pf { struct ice_hw_port_stats stats_prev; struct ice_hw hw; u8 stat_prev_loaded:1; /* has previous stats been loaded */ - u8 rdma_mode; u16 dcbx_cap; u32 tx_timeout_count; unsigned long tx_timeout_last_recovery; u32 tx_timeout_recovery_level; char int_name[ICE_INT_NAME_STR_LEN]; char int_name_ll_ts[ICE_INT_NAME_STR_LEN]; - struct auxiliary_device *adev; int aux_idx; u32 sw_int_count; /* count of tc_flower filters specific to channel (aka where filter @@ -664,6 +659,7 @@ struct ice_pf { struct ice_dplls dplls; struct device *hwmon_dev; struct ice_health health_reporters; + struct iidc_rdma_core_dev_info *cdev_info; u8 num_quanta_prof_used; }; diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index fe16c59796db..c5ef33c100d6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -740,6 +740,8 @@ static int ice_dcb_noncontig_cfg(struct ice_pf *pf) void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) { struct ice_dcbx_cfg *dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + struct iidc_rdma_priv_dev_info *privd; + struct iidc_rdma_core_dev_info *cdev; struct iidc_rdma_event *event; u8 tc_map = 0; int v, ret; @@ -783,7 +785,11 @@ void ice_pf_dcb_recfg(struct ice_pf *pf, bool locked) if (vsi->type == ICE_VSI_PF) ice_dcbnl_set_all(vsi); } - if (!locked) { + + cdev = pf->cdev_info; + if (cdev && !locked) { + privd = cdev->iidc_priv; + ice_setup_dcb_qos_info(pf, &privd->qos_info); /* Notify the AUX drivers that TC change is finished */ event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) @@ -944,6 +950,37 @@ ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring, } } +/** + * ice_setup_dcb_qos_info - Setup DCB QoS information + * @pf: ptr to ice_pf + * @qos_info: QoS param instance + */ +void ice_setup_dcb_qos_info(struct ice_pf *pf, struct iidc_rdma_qos_params *qos_info) +{ + struct ice_dcbx_cfg *dcbx_cfg; + unsigned int i; + u32 up2tc; + + if (!pf || !qos_info) + return; + + dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; + up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); + + qos_info->num_tc = ice_dcb_get_num_tc(dcbx_cfg); + + for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) + qos_info->up2tc[i] = (up2tc >> (i * 3)) & 0x7; + + for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) + qos_info->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i]; + + qos_info->pfc_mode = dcbx_cfg->pfc_mode; + if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) + for (i = 0; i < DSCP_MAX; i++) + qos_info->dscp_map[i] = dcbx_cfg->dscp_map[i]; +} + /** * ice_dcb_is_mib_change_pending - Check if MIB change is pending * @state: MIB change state diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h index 800879a88c5e..da9ba814b4e8 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.h @@ -31,6 +31,9 @@ void ice_tx_prepare_vlan_flags_dcb(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first); void +ice_setup_dcb_qos_info(struct ice_pf *pf, + struct iidc_rdma_qos_params *qos_info); +void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event); /** @@ -134,5 +137,11 @@ static inline void ice_update_dcb_stats(struct ice_pf *pf) { } static inline void ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, struct ice_rq_event_info *event) { } static inline void ice_set_cgd_num(struct ice_tlan_ctx *tlan_ctx, u8 dcb_tc) { } +static inline void +ice_setup_dcb_qos_info(struct ice_pf *pf, struct iidc_rdma_qos_params *qos_info) +{ + qos_info->num_tc = 1; + qos_info->tc_info[0].rel_bw = 100; +} #endif /* CONFIG_DCB */ #endif /* _ICE_DCB_LIB_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 7c2dc347e4e5..46fbcd391a80 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3964,11 +3964,11 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) return -EINVAL; } - if (pf->adev) { + if (pf->cdev_info && pf->cdev_info->adev) { mutex_lock(&pf->adev_mutex); - device_lock(&pf->adev->dev); + device_lock(&pf->cdev_info->adev->dev); locked = true; - if (pf->adev->dev.driver) { + if (pf->cdev_info->adev->dev.driver) { netdev_err(dev, "Cannot change channels when RDMA is active\n"); ret = -EBUSY; goto adev_unlock; @@ -3987,7 +3987,7 @@ static int ice_set_channels(struct net_device *dev, struct ethtool_channels *ch) adev_unlock: if (locked) { - device_unlock(&pf->adev->dev); + device_unlock(&pf->cdev_info->adev->dev); mutex_unlock(&pf->adev_mutex); } return ret; diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index 4421ba024396..6ab53e430f91 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -9,18 +9,20 @@ static DEFINE_XARRAY_ALLOC1(ice_aux_id); /** - * ice_get_auxiliary_drv - retrieve iidc_auxiliary_drv struct - * @pf: pointer to PF struct + * ice_get_auxiliary_drv - retrieve iidc_rdma_core_auxiliary_drv struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * * This function has to be called with a device_lock on the - * pf->adev.dev to avoid race conditions. + * cdev->adev.dev to avoid race conditions. + * + * Return: pointer to the matched auxiliary driver struct */ -static -struct iidc_rdma_core_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) +static struct iidc_rdma_core_auxiliary_drv * +ice_get_auxiliary_drv(struct iidc_rdma_core_dev_info *cdev) { struct auxiliary_device *adev; - adev = pf->adev; + adev = cdev->adev; if (!adev || !adev->dev.driver) return NULL; @@ -36,41 +38,51 @@ struct iidc_rdma_core_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf) void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_rdma_event *event) { struct iidc_rdma_core_auxiliary_drv *iadrv; + struct iidc_rdma_core_dev_info *cdev; if (WARN_ON_ONCE(!in_task())) return; + cdev = pf->cdev_info; + if (!cdev) + return; + mutex_lock(&pf->adev_mutex); - if (!pf->adev) + if (!cdev->adev) goto finish; - device_lock(&pf->adev->dev); - iadrv = ice_get_auxiliary_drv(pf); + device_lock(&cdev->adev->dev); + iadrv = ice_get_auxiliary_drv(cdev); if (iadrv && iadrv->event_handler) - iadrv->event_handler(pf, event); - device_unlock(&pf->adev->dev); + iadrv->event_handler(cdev, event); + device_unlock(&cdev->adev->dev); finish: mutex_unlock(&pf->adev_mutex); } /** * ice_add_rdma_qset - Add Leaf Node for RDMA Qset - * @pf: PF struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * @qset: Resource to be allocated + * + * Return: Zero on success or error code encountered */ -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset) { u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; struct ice_vsi *vsi; struct device *dev; + struct ice_pf *pf; u32 qset_teid; u16 qs_handle; int status; int i; - if (WARN_ON(!pf || !qset)) + if (WARN_ON(!cdev || !qset)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); dev = ice_pf_to_dev(pf); if (!ice_is_rdma_ena(pf)) @@ -101,7 +113,6 @@ int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) dev_err(dev, "Failed VSI RDMA Qset enable\n"); return status; } - vsi->qset_handle[qset->tc] = qset->qs_handle; qset->teid = qset_teid; return 0; @@ -110,18 +121,23 @@ EXPORT_SYMBOL_GPL(ice_add_rdma_qset); /** * ice_del_rdma_qset - Delete leaf node for RDMA Qset - * @pf: PF struct + * @cdev: pointer to iidc_rdma_core_dev_info struct * @qset: Resource to be freed + * + * Return: Zero on success, error code on failure */ -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset) { struct ice_vsi *vsi; + struct ice_pf *pf; u32 teid; u16 q_id; - if (WARN_ON(!pf || !qset)) + if (WARN_ON(!cdev || !qset)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); vsi = ice_find_vsi(pf, qset->vport_id); if (!vsi) { dev_err(ice_pf_to_dev(pf), "RDMA Invalid VSI\n"); @@ -131,37 +147,36 @@ int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset) q_id = qset->qs_handle; teid = qset->teid; - vsi->qset_handle[qset->tc] = 0; - return ice_dis_vsi_rdma_qset(vsi->port_info, 1, &teid, &q_id); } EXPORT_SYMBOL_GPL(ice_del_rdma_qset); /** * ice_rdma_request_reset - accept request from RDMA to perform a reset - * @pf: struct for PF + * @cdev: pointer to iidc_rdma_core_dev_info struct * @reset_type: type of reset + * + * Return: Zero on success, error code on failure */ -int ice_rdma_request_reset(struct ice_pf *pf, +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, enum iidc_rdma_reset_type reset_type) { enum ice_reset_req reset; + struct ice_pf *pf; - if (WARN_ON(!pf)) + if (WARN_ON(!cdev)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); + switch (reset_type) { - case IIDC_PFR: + case IIDC_FUNC_RESET: reset = ICE_RESET_PFR; break; - case IIDC_CORER: + case IIDC_DEV_RESET: reset = ICE_RESET_CORER; break; - case IIDC_GLOBR: - reset = ICE_RESET_GLOBR; - break; default: - dev_err(ice_pf_to_dev(pf), "incorrect reset request\n"); return -EINVAL; } @@ -171,18 +186,23 @@ EXPORT_SYMBOL_GPL(ice_rdma_request_reset); /** * ice_rdma_update_vsi_filter - update main VSI filters for RDMA - * @pf: pointer to struct for PF + * @cdev: pointer to iidc_rdma_core_dev_info struct * @vsi_id: VSI HW idx to update filter on * @enable: bool whether to enable or disable filters + * + * Return: Zero on success, error code on failure */ -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable) +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, + u16 vsi_id, bool enable) { struct ice_vsi *vsi; + struct ice_pf *pf; int status; - if (WARN_ON(!pf)) + if (WARN_ON(!cdev)) return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); vsi = ice_find_vsi(pf, vsi_id); if (!vsi) return -EINVAL; @@ -203,37 +223,23 @@ int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable) EXPORT_SYMBOL_GPL(ice_rdma_update_vsi_filter); /** - * ice_get_qos_params - parse QoS params for RDMA consumption - * @pf: pointer to PF struct - * @qos: set of QoS values + * ice_alloc_rdma_qvector - alloc vector resources reserved for RDMA driver + * @cdev: pointer to iidc_rdma_core_dev_info struct + * @entry: MSI-X entry to be removed + * + * Return: Zero on success, error code on failure */ -void ice_get_qos_params(struct ice_pf *pf, struct iidc_rdma_qos_params *qos) +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry) { - struct ice_dcbx_cfg *dcbx_cfg; - unsigned int i; - u32 up2tc; - - dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg; - up2tc = rd32(&pf->hw, PRTDCB_TUP2TC); - - qos->num_tc = ice_dcb_get_num_tc(dcbx_cfg); - for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) - qos->up2tc[i] = (up2tc >> (i * 3)) & 0x7; - - for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) - qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i]; - - qos->pfc_mode = dcbx_cfg->pfc_mode; - if (qos->pfc_mode == IIDC_DSCP_PFC_MODE) - for (i = 0; i < DSCP_MAX; i++) - qos->dscp_map[i] = dcbx_cfg->dscp_map[i]; -} -EXPORT_SYMBOL_GPL(ice_get_qos_params); + struct msi_map map; + struct ice_pf *pf; -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) -{ - struct msi_map map = ice_alloc_irq(pf, true); + if (WARN_ON(!cdev)) + return -EINVAL; + pf = pci_get_drvdata(cdev->pdev); + map = ice_alloc_irq(pf, true); if (map.index < 0) return -ENOMEM; @@ -246,12 +252,19 @@ EXPORT_SYMBOL_GPL(ice_alloc_rdma_qvector); /** * ice_free_rdma_qvector - free vector resources reserved for RDMA driver - * @pf: board private structure to initialize + * @cdev: pointer to iidc_rdma_core_dev_info struct * @entry: MSI-X entry to be removed */ -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry) +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry) { struct msi_map map; + struct ice_pf *pf; + + if (WARN_ON(!cdev || !entry)) + return; + + pf = pci_get_drvdata(cdev->pdev); map.index = entry->entry; map.virq = entry->vector; @@ -275,10 +288,13 @@ static void ice_adev_release(struct device *dev) /** * ice_plug_aux_dev - allocate and register AUX device * @pf: pointer to pf struct + * + * Return: Zero on success, error code on failure */ int ice_plug_aux_dev(struct ice_pf *pf) { struct iidc_rdma_core_auxiliary_dev *iadev; + struct iidc_rdma_core_dev_info *cdev; struct auxiliary_device *adev; int ret; @@ -288,17 +304,22 @@ int ice_plug_aux_dev(struct ice_pf *pf) if (!ice_is_rdma_ena(pf)) return 0; + cdev = pf->cdev_info; + if (!cdev) + return -ENODEV; + iadev = kzalloc(sizeof(*iadev), GFP_KERNEL); if (!iadev) return -ENOMEM; adev = &iadev->adev; - iadev->pf = pf; + iadev->cdev_info = cdev; adev->id = pf->aux_idx; adev->dev.release = ice_adev_release; adev->dev.parent = &pf->pdev->dev; - adev->name = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? "roce" : "iwarp"; + adev->name = cdev->rdma_protocol & IIDC_RDMA_PROTOCOL_ROCEV2 ? + "roce" : "iwarp"; ret = auxiliary_device_init(adev); if (ret) { @@ -313,7 +334,7 @@ int ice_plug_aux_dev(struct ice_pf *pf) } mutex_lock(&pf->adev_mutex); - pf->adev = adev; + cdev->adev = adev; mutex_unlock(&pf->adev_mutex); return 0; @@ -327,8 +348,8 @@ void ice_unplug_aux_dev(struct ice_pf *pf) struct auxiliary_device *adev; mutex_lock(&pf->adev_mutex); - adev = pf->adev; - pf->adev = NULL; + adev = pf->cdev_info->adev; + pf->cdev_info->adev = NULL; mutex_unlock(&pf->adev_mutex); if (adev) { @@ -343,7 +364,9 @@ void ice_unplug_aux_dev(struct ice_pf *pf) */ int ice_init_rdma(struct ice_pf *pf) { + struct iidc_rdma_priv_dev_info *privd; struct device *dev = &pf->pdev->dev; + struct iidc_rdma_core_dev_info *cdev; int ret; if (!ice_is_rdma_ena(pf)) { @@ -351,22 +374,50 @@ int ice_init_rdma(struct ice_pf *pf) return 0; } + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) + return -ENOMEM; + + pf->cdev_info = cdev; + + privd = kzalloc(sizeof(*privd), GFP_KERNEL); + if (!privd) { + ret = -ENOMEM; + goto err_privd_alloc; + } + + privd->pf_id = pf->hw.pf_id; ret = xa_alloc(&ice_aux_id, &pf->aux_idx, NULL, XA_LIMIT(1, INT_MAX), GFP_KERNEL); if (ret) { dev_err(dev, "Failed to allocate device ID for AUX driver\n"); - return -ENOMEM; + ret = -ENOMEM; + goto err_alloc_xa; } - pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2; + cdev->iidc_priv = privd; + privd->netdev = pf->vsi[0]->netdev; + + privd->hw_addr = (u8 __iomem *)pf->hw.hw_addr; + cdev->pdev = pf->pdev; + privd->vport_id = pf->vsi[0]->vsi_num; + + pf->cdev_info->rdma_protocol |= IIDC_RDMA_PROTOCOL_ROCEV2; + ice_setup_dcb_qos_info(pf, &privd->qos_info); ret = ice_plug_aux_dev(pf); if (ret) goto err_plug_aux_dev; return 0; err_plug_aux_dev: - pf->adev = NULL; + pf->cdev_info->adev = NULL; xa_erase(&ice_aux_id, pf->aux_idx); +err_alloc_xa: + kfree(privd); +err_privd_alloc: + kfree(cdev); + pf->cdev_info = NULL; + return ret; } @@ -381,4 +432,7 @@ void ice_deinit_rdma(struct ice_pf *pf) ice_unplug_aux_dev(pf); xa_erase(&ice_aux_id, pf->aux_idx); + kfree(pf->cdev_info->iidc_priv); + kfree(pf->cdev_info); + pf->cdev_info = NULL; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 6a20f9f2e608..eace0e3f15e8 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -9310,6 +9310,7 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { struct ice_netdev_priv *np = netdev_priv(netdev); + struct iidc_rdma_core_dev_info *cdev; struct ice_pf *pf = np->vsi->back; bool locked = false; int err; @@ -9326,11 +9327,12 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, return -EOPNOTSUPP; } - if (pf->adev) { + cdev = pf->cdev_info; + if (cdev && cdev->adev) { mutex_lock(&pf->adev_mutex); - device_lock(&pf->adev->dev); + device_lock(&cdev->adev->dev); locked = true; - if (pf->adev->dev.driver) { + if (cdev->adev->dev.driver) { netdev_err(netdev, "Cannot change qdisc when RDMA is active\n"); err = -EBUSY; goto adev_unlock; @@ -9344,7 +9346,7 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, adev_unlock: if (locked) { - device_unlock(&pf->adev->dev); + device_unlock(&cdev->adev->dev); mutex_unlock(&pf->adev_mutex); } return err; diff --git a/include/linux/net/intel/iidc_rdma.h b/include/linux/net/intel/iidc_rdma.h index 7f1910289534..8baad1082042 100644 --- a/include/linux/net/intel/iidc_rdma.h +++ b/include/linux/net/intel/iidc_rdma.h @@ -5,7 +5,6 @@ #define _IIDC_RDMA_H_ #include -#include #include #include #include @@ -17,14 +16,19 @@ enum iidc_rdma_event_type { IIDC_RDMA_EVENT_AFTER_MTU_CHANGE, IIDC_RDMA_EVENT_BEFORE_TC_CHANGE, IIDC_RDMA_EVENT_AFTER_TC_CHANGE, + IIDC_RDMA_EVENT_WARN_RESET, IIDC_RDMA_EVENT_CRIT_ERR, IIDC_RDMA_EVENT_NBITS /* must be last */ }; +struct iidc_rdma_event { + DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); + u32 reg; +}; + enum iidc_rdma_reset_type { - IIDC_PFR, - IIDC_CORER, - IIDC_GLOBR, + IIDC_FUNC_RESET, + IIDC_DEV_RESET, }; enum iidc_rdma_protocol { @@ -32,53 +36,22 @@ enum iidc_rdma_protocol { IIDC_RDMA_PROTOCOL_ROCEV2 = BIT(1), }; -#define IIDC_MAX_USER_PRIORITY 8 -#define IIDC_DSCP_PFC_MODE 0x1 - -/* Struct to hold per RDMA Qset info */ -struct iidc_rdma_qset_params { - /* Qset TEID returned to the RDMA driver in - * ice_add_rdma_qset and used by RDMA driver - * for calls to ice_del_rdma_qset - */ - u32 teid; /* Qset TEID */ - u16 qs_handle; /* RDMA driver provides this */ - u16 vport_id; /* VSI index */ - u8 tc; /* TC branch the Qset should belong to */ -}; - -struct iidc_rdma_qos_info { - u64 tc_ctx; - u8 rel_bw; - u8 prio_type; - u8 egress_virt_up; - u8 ingress_virt_up; -}; - -/* Struct to pass QoS info */ -struct iidc_rdma_qos_params { - struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; - u8 up2tc[IIDC_MAX_USER_PRIORITY]; - u8 vport_relative_bw; - u8 vport_priority_type; - u8 num_tc; - u8 pfc_mode; - u8 dscp_map[DSCP_MAX]; -}; - -struct iidc_rdma_event { - DECLARE_BITMAP(type, IIDC_RDMA_EVENT_NBITS); - u32 reg; +/* Structure to be populated by core LAN PCI driver */ +struct iidc_rdma_core_dev_info { + struct pci_dev *pdev; /* PCI device of corresponding to main function */ + struct auxiliary_device *adev; + /* Current active RDMA protocol */ + enum iidc_rdma_protocol rdma_protocol; + void *iidc_priv; /* elements unique to each driver */ }; /* Structure representing auxiliary driver tailored information about the core * PCI dev, each auxiliary driver using the IIDC interface will have an * instance of this struct dedicated to it. */ - struct iidc_rdma_core_auxiliary_dev { struct auxiliary_device adev; - struct ice_pf *pf; + struct iidc_rdma_core_dev_info *cdev_info; }; /* structure representing the auxiliary driver. This struct is to be @@ -88,12 +61,8 @@ struct iidc_rdma_core_auxiliary_dev { */ struct iidc_rdma_core_auxiliary_drv { struct auxiliary_driver adrv; - /* This event_handler is meant to be a blocking call. For instance, - * when a BEFORE_MTU_CHANGE event comes in, the event_handler will not - * return until the auxiliary driver is ready for the MTU change to - * happen. - */ - void (*event_handler)(struct ice_pf *pf, struct iidc_rdma_event *event); + void (*event_handler)(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_event *event); }; #endif /* _IIDC_RDMA_H_*/ diff --git a/include/linux/net/intel/iidc_rdma_ice.h b/include/linux/net/intel/iidc_rdma_ice.h index 78d10003d776..b40eed0e13fe 100644 --- a/include/linux/net/intel/iidc_rdma_ice.h +++ b/include/linux/net/intel/iidc_rdma_ice.h @@ -4,16 +4,67 @@ #ifndef _IIDC_RDMA_ICE_H_ #define _IIDC_RDMA_ICE_H_ -struct ice_pf; +#include -int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset); -int ice_rdma_request_reset(struct ice_pf *pf, +#define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_DSCP_PFC_MODE 0x1 + +/** + * struct iidc_rdma_qset_params - Struct to hold per RDMA Qset info + * @teid: TEID of the Qset node + * @qs_handle: SW index of the Qset, RDMA provides this + * @vport_id: VSI index + * @tc: Traffic Class branch the QSet should belong to + */ +struct iidc_rdma_qset_params { + /* Qset TEID returned to the RDMA driver in + * ice_add_rdma_qset and used by RDMA driver + * for calls to ice_del_rdma_qset + */ + u32 teid; + u16 qs_handle; + u16 vport_id; + u8 tc; +}; + +struct iidc_rdma_qos_info { + u64 tc_ctx; + u8 rel_bw; + u8 prio_type; + u8 egress_virt_up; + u8 ingress_virt_up; +}; + +/* Struct to pass QoS info */ +struct iidc_rdma_qos_params { + struct iidc_rdma_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + u8 up2tc[IIDC_MAX_USER_PRIORITY]; + u8 vport_relative_bw; + u8 vport_priority_type; + u8 num_tc; + u8 pfc_mode; + u8 dscp_map[DSCP_MAX]; +}; + +struct iidc_rdma_priv_dev_info { + u8 pf_id; + u16 vport_id; + struct net_device *netdev; + struct iidc_rdma_qos_params qos_info; + u8 __iomem *hw_addr; +}; + +int ice_add_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_del_rdma_qset(struct iidc_rdma_core_dev_info *cdev, + struct iidc_rdma_qset_params *qset); +int ice_rdma_request_reset(struct iidc_rdma_core_dev_info *cdev, enum iidc_rdma_reset_type reset_type); -int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable); -void ice_get_qos_params(struct ice_pf *pf, - struct iidc_rdma_qos_params *qos); -int ice_alloc_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); -void ice_free_rdma_qvector(struct ice_pf *pf, struct msix_entry *entry); +int ice_rdma_update_vsi_filter(struct iidc_rdma_core_dev_info *cdev, u16 vsi_id, + bool enable); +int ice_alloc_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); +void ice_free_rdma_qvector(struct iidc_rdma_core_dev_info *cdev, + struct msix_entry *entry); #endif /* _IIDC_RDMA_ICE_H_*/ -- cgit v1.2.3 From 1b2900db0119c02e6445bb61ec3fba982d10cc8d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 8 May 2025 13:30:34 +0300 Subject: ethtool: Block setting of symmetric RSS when non-symmetric rx-flow-hash is requested Symmetric RSS hash requires that: * No other fields besides IP src/dst and/or L4 src/dst are set * If src is set, dst must also be set This restriction was only enforced when RXNFC was configured after symmetric hash was enabled. In the opposite order of operations (RXNFC then symmetric enablement) the check was not performed. Perform the sanity check on set_rxfh as well, by iterating over all flow types hash fields and making sure they are all symmetric. Introduce a function that returns whether a flow type is hashable (not spec only) and needs to be iterated over. To make sure that no one forgets to update the list of hashable flow types when adding new flow types, a static assert is added to draw the developer's attention. The conversion of uapi #defines to enum is not ideal, but as Jakub mentioned [1], we have precedent for that. [1] https://lore.kernel.org/netdev/20250324073509.6571ade3@kernel.org/ Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250508103034.885536-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 134 ++++++++++++++++++++++--------------------- net/ethtool/ioctl.c | 99 ++++++++++++++++++++++++++++---- 2 files changed, 158 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 84833cca29fe..707c1844010c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2295,71 +2295,75 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define RXH_XFRM_SYM_OR_XOR (1 << 1) #define RXH_XFRM_NO_CHANGE 0xff -/* L2-L4 network traffic flow types */ -#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ -#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ -#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ -#define AH_ESP_V4_FLOW 0x04 /* hash only */ -#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ -#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ -#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ -#define AH_ESP_V6_FLOW 0x08 /* hash only */ -#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ -#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ -#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ -#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ -#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ -#define IP_USER_FLOW IPV4_USER_FLOW -#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ -#define IPV4_FLOW 0x10 /* hash only */ -#define IPV6_FLOW 0x11 /* hash only */ -#define ETHER_FLOW 0x12 /* spec only (ether_spec) */ - -/* Used for GTP-U IPv4 and IPv6. - * The format of GTP packets only includes - * elements such as TEID and GTP version. - * It is primarily intended for data communication of the UE. - */ -#define GTPU_V4_FLOW 0x13 /* hash only */ -#define GTPU_V6_FLOW 0x14 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * The format of these GTP packets does not include TEID. - * Primarily expected to be used for communication - * to create sessions for UE data communication, - * commonly referred to as CSR (Create Session Request). - */ -#define GTPC_V4_FLOW 0x15 /* hash only */ -#define GTPC_V6_FLOW 0x16 /* hash only */ - -/* Use for GTP-C IPv4 and v6. - * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. - * After session creation, it becomes this packet. - * This is mainly used for requests to realize UE handover. - */ -#define GTPC_TEID_V4_FLOW 0x17 /* hash only */ -#define GTPC_TEID_V6_FLOW 0x18 /* hash only */ - -/* Use for GTP-U and extended headers for the PSC (PDU Session Container). - * The format of these GTP packets includes TEID and QFI. - * In 5G communication using UPF (User Plane Function), - * data communication with this extended header is performed. - */ -#define GTPU_EH_V4_FLOW 0x19 /* hash only */ -#define GTPU_EH_V6_FLOW 0x1a /* hash only */ - -/* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. - * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by - * UL/DL included in the PSC. - * There are differences in the data included based on Downlink/Uplink, - * and can be used to distinguish packets. - * The functions described so far are useful when you want to - * handle communication from the mobile network in UPF, PGW, etc. - */ -#define GTPU_UL_V4_FLOW 0x1b /* hash only */ -#define GTPU_UL_V6_FLOW 0x1c /* hash only */ -#define GTPU_DL_V4_FLOW 0x1d /* hash only */ -#define GTPU_DL_V6_FLOW 0x1e /* hash only */ +enum { + /* L2-L4 network traffic flow types */ + TCP_V4_FLOW = 0x01, /* hash or spec (tcp_ip4_spec) */ + UDP_V4_FLOW = 0x02, /* hash or spec (udp_ip4_spec) */ + SCTP_V4_FLOW = 0x03, /* hash or spec (sctp_ip4_spec) */ + AH_ESP_V4_FLOW = 0x04, /* hash only */ + TCP_V6_FLOW = 0x05, /* hash or spec (tcp_ip6_spec; nfc only) */ + UDP_V6_FLOW = 0x06, /* hash or spec (udp_ip6_spec; nfc only) */ + SCTP_V6_FLOW = 0x07, /* hash or spec (sctp_ip6_spec; nfc only) */ + AH_ESP_V6_FLOW = 0x08, /* hash only */ + AH_V4_FLOW = 0x09, /* hash or spec (ah_ip4_spec) */ + ESP_V4_FLOW = 0x0a, /* hash or spec (esp_ip4_spec) */ + AH_V6_FLOW = 0x0b, /* hash or spec (ah_ip6_spec; nfc only) */ + ESP_V6_FLOW = 0x0c, /* hash or spec (esp_ip6_spec; nfc only) */ + IPV4_USER_FLOW = 0x0d, /* spec only (usr_ip4_spec) */ + IP_USER_FLOW = IPV4_USER_FLOW, + IPV6_USER_FLOW = 0x0e, /* spec only (usr_ip6_spec; nfc only) */ + IPV4_FLOW = 0x10, /* hash only */ + IPV6_FLOW = 0x11, /* hash only */ + ETHER_FLOW = 0x12, /* spec only (ether_spec) */ + + /* Used for GTP-U IPv4 and IPv6. + * The format of GTP packets only includes + * elements such as TEID and GTP version. + * It is primarily intended for data communication of the UE. + */ + GTPU_V4_FLOW = 0x13, /* hash only */ + GTPU_V6_FLOW = 0x14, /* hash only */ + + /* Use for GTP-C IPv4 and v6. + * The format of these GTP packets does not include TEID. + * Primarily expected to be used for communication + * to create sessions for UE data communication, + * commonly referred to as CSR (Create Session Request). + */ + GTPC_V4_FLOW = 0x15, /* hash only */ + GTPC_V6_FLOW = 0x16, /* hash only */ + + /* Use for GTP-C IPv4 and v6. + * Unlike GTPC_V4_FLOW, the format of these GTP packets includes TEID. + * After session creation, it becomes this packet. + * This is mainly used for requests to realize UE handover. + */ + GTPC_TEID_V4_FLOW = 0x17, /* hash only */ + GTPC_TEID_V6_FLOW = 0x18, /* hash only */ + + /* Use for GTP-U and extended headers for the PSC (PDU Session Container). + * The format of these GTP packets includes TEID and QFI. + * In 5G communication using UPF (User Plane Function), + * data communication with this extended header is performed. + */ + GTPU_EH_V4_FLOW = 0x19, /* hash only */ + GTPU_EH_V6_FLOW = 0x1a, /* hash only */ + + /* Use for GTP-U IPv4 and v6 PSC (PDU Session Container) extended headers. + * This differs from GTPU_EH_V(4|6)_FLOW in that it is distinguished by + * UL/DL included in the PSC. + * There are differences in the data included based on Downlink/Uplink, + * and can be used to distinguish packets. + * The functions described so far are useful when you want to + * handle communication from the mobile network in UPF, PGW, etc. + */ + GTPU_UL_V4_FLOW = 0x1b, /* hash only */ + GTPU_UL_V6_FLOW = 0x1c, /* hash only */ + GTPU_DL_V4_FLOW = 0x1d, /* hash only */ + GTPU_DL_V6_FLOW = 0x1e, /* hash only */ + + __FLOW_TYPE_COUNT, +}; /* Flag to enable additional fields in struct ethtool_rx_flow_spec */ #define FLOW_EXT 0x80000000 diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 8262cc10f98d..39ec920f5de7 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -978,6 +978,88 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr, return 0; } +static bool flow_type_hashable(u32 flow_type) +{ + switch (flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV4_FLOW: + case IPV6_FLOW: + case GTPU_V4_FLOW: + case GTPU_V6_FLOW: + case GTPC_V4_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V4_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V4_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V4_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V4_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + +/* When adding a new type, update the assert and, if it's hashable, add it to + * the flow_type_hashable switch case. + */ +static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT); + +static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh) +{ + /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: + * 1 - no other fields besides IP src/dst and/or L4 src/dst are set + * 2 - If src is set, dst must also be set + */ + if ((input_xfrm != RXH_XFRM_NO_CHANGE && + input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && + ((rxfh & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || + (!!(rxfh & RXH_IP_SRC) ^ !!(rxfh & RXH_IP_DST)) || + (!!(rxfh & RXH_L4_B_0_1) ^ !!(rxfh & RXH_L4_B_2_3)))) + return -EINVAL; + + return 0; +} + +static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc info = { + .cmd = ETHTOOL_GRXFH, + }; + int err; + u32 i; + + for (i = 0; i < __FLOW_TYPE_COUNT; i++) { + if (!flow_type_hashable(i)) + continue; + + info.flow_type = i; + err = ops->get_rxnfc(dev, &info, NULL); + if (err) + continue; + + err = ethtool_check_xfrm_rxfh(input_xfrm, info.data); + if (err) + return err; + } + + return 0; +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -1012,16 +1094,9 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (rc) return rc; - /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: - * 1 - no other fields besides IP src/dst and/or L4 src/dst - * 2 - If src is set, dst must also be set - */ - if ((rxfh.input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && - ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | - RXH_L4_B_0_1 | RXH_L4_B_2_3)) || - (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || - (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) - return -EINVAL; + rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data); + if (rc) + return rc; } rc = ops->set_rxnfc(dev, &info); @@ -1413,6 +1488,10 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; + ret = ethtool_check_flow_types(dev, rxfh.input_xfrm); + if (ret) + return ret; + indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); /* Check settings which may be global rather than per RSS-context */ -- cgit v1.2.3 From 6c14058edfd01cdc0d3018b9069643b0da7c3e80 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 May 2025 12:52:36 +0300 Subject: net: dsa: convert to ndo_hwtstamp_get() and ndo_hwtstamp_set() New timestamping API was introduced in commit 66f7223039c0 ("net: add NDOs for configuring hardware timestamping") from kernel v6.6. It is time to convert DSA to the new API, so that the ndo_eth_ioctl() path can be removed completely. Move the ds->ops->port_hwtstamp_get() and ds->ops->port_hwtstamp_set() calls from dsa_user_ioctl() to dsa_user_hwtstamp_get() and dsa_user_hwtstamp_set(). Due to the fact that the underlying ifreq type changes to kernel_hwtstamp_config, the drivers and the Ocelot switchdev front-end, all hooked up directly or indirectly, must also be converted all at once. The conversion also updates the comment from dsa_port_supports_hwtstamp(), which is no longer true because kernel_hwtstamp_config is kernel memory and does not need copy_to_user(). I've deliberated whether it is necessary to also update "err != -EOPNOTSUPP" to a more general "!err", but all drivers now either return 0 or -EOPNOTSUPP. The existing logic from the ocelot_ioctl() function, to avoid configuring timestamping if the PHY supports the operation, is obsoleted by more advanced core logic in dev_set_hwtstamp_phylib(). This is only a partial preparation for proper PHY timestamping support. None of these switch driver currently sets up PTP traps for PHY timestamping, so setting dev->see_all_hwtstamp_requests is not yet necessary and the conversion is relatively trivial. Signed-off-by: Vladimir Oltean Tested-by: Vladimir Oltean # felix, sja1105, mv88e6xxx Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250508095236.887789-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/hirschmann/hellcreek.h | 2 +- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c | 24 ++++++-------- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h | 5 +-- drivers/net/dsa/microchip/ksz_common.h | 2 +- drivers/net/dsa/microchip/ksz_ptp.c | 26 ++++++--------- drivers/net/dsa/microchip/ksz_ptp.h | 7 ++-- drivers/net/dsa/mv88e6xxx/chip.h | 2 +- drivers/net/dsa/mv88e6xxx/hwtstamp.c | 24 ++++++-------- drivers/net/dsa/mv88e6xxx/hwtstamp.h | 16 +++++---- drivers/net/dsa/ocelot/felix.c | 11 ++++--- drivers/net/dsa/sja1105/sja1105_ptp.c | 32 ++++++++---------- drivers/net/dsa/sja1105/sja1105_ptp.h | 7 ++-- drivers/net/ethernet/mscc/ocelot_net.c | 33 ++++++++++++------- drivers/net/ethernet/mscc/ocelot_ptp.c | 43 ++++++++----------------- include/net/dsa.h | 5 +-- include/soc/mscc/ocelot.h | 7 ++-- net/dsa/port.c | 10 ++---- net/dsa/user.c | 41 +++++++++++++++-------- 18 files changed, 147 insertions(+), 150 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/hirschmann/hellcreek.h b/drivers/net/dsa/hirschmann/hellcreek.h index 9c2ed2ba79da..bebf0d3ff330 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.h +++ b/drivers/net/dsa/hirschmann/hellcreek.h @@ -244,7 +244,7 @@ struct hellcreek_port_hwtstamp { struct sk_buff *tx_skb; /* Current timestamp configuration */ - struct hwtstamp_config tstamp_config; + struct kernel_hwtstamp_config tstamp_config; }; struct hellcreek_port { diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c index ca2500aba96f..99941ff1ebf9 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c @@ -40,7 +40,7 @@ int hellcreek_get_ts_info(struct dsa_switch *ds, int port, * the user requested what is actually available or not */ static int hellcreek_set_hwtstamp_config(struct hellcreek *hellcreek, int port, - struct hwtstamp_config *config) + struct kernel_hwtstamp_config *config) { struct hellcreek_port_hwtstamp *ps = &hellcreek->ports[port].port_hwtstamp; @@ -110,41 +110,35 @@ static int hellcreek_set_hwtstamp_config(struct hellcreek *hellcreek, int port, } int hellcreek_port_hwtstamp_set(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; struct hellcreek_port_hwtstamp *ps; - struct hwtstamp_config config; int err; ps = &hellcreek->ports[port].port_hwtstamp; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - err = hellcreek_set_hwtstamp_config(hellcreek, port, &config); + err = hellcreek_set_hwtstamp_config(hellcreek, port, config); if (err) return err; /* Save the chosen configuration to be returned later */ - memcpy(&ps->tstamp_config, &config, sizeof(config)); + ps->tstamp_config = *config; - return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? - -EFAULT : 0; + return 0; } int hellcreek_port_hwtstamp_get(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config) { struct hellcreek *hellcreek = ds->priv; struct hellcreek_port_hwtstamp *ps; - struct hwtstamp_config *config; ps = &hellcreek->ports[port].port_hwtstamp; - config = &ps->tstamp_config; + *config = ps->tstamp_config; - return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? - -EFAULT : 0; + return 0; } /* Returns a pointer to the PTP header if the caller should time stamp, or NULL diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h index 7d88da2134f2..388821c4aa10 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h @@ -38,9 +38,10 @@ #define TX_TSTAMP_TIMEOUT msecs_to_jiffies(40) int hellcreek_port_hwtstamp_set(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); int hellcreek_port_hwtstamp_get(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); bool hellcreek_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index dd5429ff16ee..a034017568cd 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -142,7 +142,7 @@ struct ksz_port { struct ksz_irq pirq; u8 num; #if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ_PTP) - struct hwtstamp_config tstamp_config; + struct kernel_hwtstamp_config tstamp_config; bool hwts_tx_en; bool hwts_rx_en; struct ksz_irq ptpirq; diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c index 22fb9ef4645c..bc54a96ba646 100644 --- a/drivers/net/dsa/microchip/ksz_ptp.c +++ b/drivers/net/dsa/microchip/ksz_ptp.c @@ -319,22 +319,21 @@ int ksz_get_ts_info(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_in return 0; } -int ksz_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr) +int ksz_hwtstamp_get(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config) { struct ksz_device *dev = ds->priv; - struct hwtstamp_config *config; struct ksz_port *prt; prt = &dev->ports[port]; - config = &prt->tstamp_config; + *config = prt->tstamp_config; - return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? - -EFAULT : 0; + return 0; } static int ksz_set_hwtstamp_config(struct ksz_device *dev, struct ksz_port *prt, - struct hwtstamp_config *config) + struct kernel_hwtstamp_config *config) { int ret; @@ -404,26 +403,21 @@ static int ksz_set_hwtstamp_config(struct ksz_device *dev, return ksz_ptp_enable_mode(dev); } -int ksz_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) +int ksz_hwtstamp_set(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct ksz_device *dev = ds->priv; - struct hwtstamp_config config; struct ksz_port *prt; int ret; prt = &dev->ports[port]; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - ret = ksz_set_hwtstamp_config(dev, prt, &config); + ret = ksz_set_hwtstamp_config(dev, prt, config); if (ret) return ret; - memcpy(&prt->tstamp_config, &config, sizeof(config)); - - if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) - return -EFAULT; + prt->tstamp_config = *config; return 0; } diff --git a/drivers/net/dsa/microchip/ksz_ptp.h b/drivers/net/dsa/microchip/ksz_ptp.h index 2f1783c0d723..3086e519b1b6 100644 --- a/drivers/net/dsa/microchip/ksz_ptp.h +++ b/drivers/net/dsa/microchip/ksz_ptp.h @@ -39,8 +39,11 @@ void ksz_ptp_clock_unregister(struct dsa_switch *ds); int ksz_get_ts_info(struct dsa_switch *ds, int port, struct kernel_ethtool_ts_info *ts); -int ksz_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr); -int ksz_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr); +int ksz_hwtstamp_get(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config); +int ksz_hwtstamp_set(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void ksz_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); void ksz_port_deferred_xmit(struct kthread_work *work); bool ksz_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 86bf113c9bfa..7d00482f53a3 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -241,7 +241,7 @@ struct mv88e6xxx_port_hwtstamp { u16 tx_seq_id; /* Current timestamp configuration */ - struct hwtstamp_config tstamp_config; + struct kernel_hwtstamp_config tstamp_config; }; enum mv88e6xxx_policy_mapping { diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c index 49e6e1355142..f663799b0b3b 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c @@ -89,7 +89,7 @@ int mv88e6xxx_get_ts_info(struct dsa_switch *ds, int port, } static int mv88e6xxx_set_hwtstamp_config(struct mv88e6xxx_chip *chip, int port, - struct hwtstamp_config *config) + struct kernel_hwtstamp_config *config) { const struct mv88e6xxx_ptp_ops *ptp_ops = chip->info->ops->ptp_ops; struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port]; @@ -169,42 +169,38 @@ static int mv88e6xxx_set_hwtstamp_config(struct mv88e6xxx_chip *chip, int port, } int mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port]; - struct hwtstamp_config config; int err; if (!chip->info->ptp_support) return -EOPNOTSUPP; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - err = mv88e6xxx_set_hwtstamp_config(chip, port, &config); + err = mv88e6xxx_set_hwtstamp_config(chip, port, config); if (err) return err; /* Save the chosen configuration to be returned later. */ - memcpy(&ps->tstamp_config, &config, sizeof(config)); + ps->tstamp_config = *config; - return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? - -EFAULT : 0; + return 0; } int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config) { struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port]; - struct hwtstamp_config *config = &ps->tstamp_config; if (!chip->info->ptp_support) return -EOPNOTSUPP; - return copy_to_user(ifr->ifr_data, config, sizeof(*config)) ? - -EFAULT : 0; + *config = ps->tstamp_config; + + return 0; } /* Returns a pointer to the PTP header if the caller should time stamp, diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.h b/drivers/net/dsa/mv88e6xxx/hwtstamp.h index 85acc758e3eb..22e4acc957f0 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.h +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.h @@ -111,9 +111,10 @@ #ifdef CONFIG_NET_DSA_MV88E6XXX_PTP int mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *cfg); bool mv88e6xxx_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); @@ -132,14 +133,17 @@ int mv88e6165_global_disable(struct mv88e6xxx_chip *chip); #else /* !CONFIG_NET_DSA_MV88E6XXX_PTP */ -static inline int mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, - int port, struct ifreq *ifr) +static inline int +mv88e6xxx_port_hwtstamp_set(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } -static inline int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, - int port, struct ifreq *ifr) +static inline int +mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config) { return -EOPNOTSUPP; } diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 0a4e682a55ef..2dd4e56e1cf1 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1774,22 +1774,25 @@ static void felix_teardown(struct dsa_switch *ds) } static int felix_hwtstamp_get(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config) { struct ocelot *ocelot = ds->priv; - return ocelot_hwstamp_get(ocelot, port, ifr); + ocelot_hwstamp_get(ocelot, port, config); + + return 0; } static int felix_hwtstamp_set(struct dsa_switch *ds, int port, - struct ifreq *ifr) + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct ocelot *ocelot = ds->priv; struct felix *felix = ocelot_to_felix(ocelot); bool using_tag_8021q; int err; - err = ocelot_hwstamp_set(ocelot, port, ifr); + err = ocelot_hwstamp_set(ocelot, port, config, extack); if (err) return err; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 3abc64aec411..fefe46e2a5e6 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -58,19 +58,17 @@ enum sja1105_ptp_clk_mode { #define ptp_data_to_sja1105(d) \ container_of((d), struct sja1105_private, ptp_data) -int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) +int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct sja1105_private *priv = ds->priv; unsigned long hwts_tx_en, hwts_rx_en; - struct hwtstamp_config config; - - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; hwts_tx_en = priv->hwts_tx_en; hwts_rx_en = priv->hwts_rx_en; - switch (config.tx_type) { + switch (config->tx_type) { case HWTSTAMP_TX_OFF: hwts_tx_en &= ~BIT(port); break; @@ -81,7 +79,7 @@ int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) return -ERANGE; } - switch (config.rx_filter) { + switch (config->rx_filter) { case HWTSTAMP_FILTER_NONE: hwts_rx_en &= ~BIT(port); break; @@ -92,32 +90,28 @@ int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr) return -ERANGE; } - if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) - return -EFAULT; - priv->hwts_tx_en = hwts_tx_en; priv->hwts_rx_en = hwts_rx_en; return 0; } -int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr) +int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config) { struct sja1105_private *priv = ds->priv; - struct hwtstamp_config config; - config.flags = 0; + config->flags = 0; if (priv->hwts_tx_en & BIT(port)) - config.tx_type = HWTSTAMP_TX_ON; + config->tx_type = HWTSTAMP_TX_ON; else - config.tx_type = HWTSTAMP_TX_OFF; + config->tx_type = HWTSTAMP_TX_OFF; if (priv->hwts_rx_en & BIT(port)) - config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; + config->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; else - config.rx_filter = HWTSTAMP_FILTER_NONE; + config->rx_filter = HWTSTAMP_FILTER_NONE; - return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ? - -EFAULT : 0; + return 0; } int sja1105_get_ts_info(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index 8add2bd5f728..325e3777ea07 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -112,9 +112,12 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); -int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr); +int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config); -int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, struct ifreq *ifr); +int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); int __sja1105_ptp_gettimex(struct dsa_switch *ds, u64 *ns, struct ptp_system_timestamp *sts); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 7663d196eaf8..469784d3a1a6 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -869,24 +869,31 @@ static int ocelot_set_features(struct net_device *dev, } static int ocelot_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + return phy_mii_ioctl(dev->phydev, ifr, cmd); +} + +static int ocelot_port_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot *ocelot = priv->port.ocelot; int port = priv->port.index; - /* If the attached PHY device isn't capable of timestamping operations, - * use our own (when possible). - */ - if (!phy_has_hwtstamp(dev->phydev) && ocelot->ptp) { - switch (cmd) { - case SIOCSHWTSTAMP: - return ocelot_hwstamp_set(ocelot, port, ifr); - case SIOCGHWTSTAMP: - return ocelot_hwstamp_get(ocelot, port, ifr); - } - } + ocelot_hwstamp_get(ocelot, port, cfg); - return phy_mii_ioctl(dev->phydev, ifr, cmd); + return 0; +} + +static int ocelot_port_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct ocelot_port_private *priv = netdev_priv(dev); + struct ocelot *ocelot = priv->port.ocelot; + int port = priv->port.index; + + return ocelot_hwstamp_set(ocelot, port, cfg, extack); } static int ocelot_change_mtu(struct net_device *dev, int new_mtu) @@ -917,6 +924,8 @@ static const struct net_device_ops ocelot_port_netdev_ops = { .ndo_set_features = ocelot_set_features, .ndo_setup_tc = ocelot_setup_tc, .ndo_eth_ioctl = ocelot_ioctl, + .ndo_hwtstamp_get = ocelot_port_hwtstamp_get, + .ndo_hwtstamp_set = ocelot_port_hwtstamp_set, }; struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index d2a0a32f75ea..88b5422cc2a0 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -514,47 +514,42 @@ static int ocelot_ptp_tx_type_to_cmd(int tx_type, int *ptp_cmd) return 0; } -int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr) +void ocelot_hwstamp_get(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct hwtstamp_config cfg = {}; switch (ocelot_port->ptp_cmd) { case IFH_REW_OP_TWO_STEP_PTP: - cfg.tx_type = HWTSTAMP_TX_ON; + cfg->tx_type = HWTSTAMP_TX_ON; break; case IFH_REW_OP_ORIGIN_PTP: - cfg.tx_type = HWTSTAMP_TX_ONESTEP_SYNC; + cfg->tx_type = HWTSTAMP_TX_ONESTEP_SYNC; break; default: - cfg.tx_type = HWTSTAMP_TX_OFF; + cfg->tx_type = HWTSTAMP_TX_OFF; break; } - cfg.rx_filter = ocelot_traps_to_ptp_rx_filter(ocelot_port->trap_proto); - - return copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)) ? -EFAULT : 0; + cfg->rx_filter = ocelot_traps_to_ptp_rx_filter(ocelot_port->trap_proto); } EXPORT_SYMBOL(ocelot_hwstamp_get); -int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr) +int ocelot_hwstamp_set(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - int ptp_cmd, old_ptp_cmd = ocelot_port->ptp_cmd; bool l2 = false, l4 = false; - struct hwtstamp_config cfg; - bool old_l2, old_l4; + int ptp_cmd; int err; - if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) - return -EFAULT; - /* Tx type sanity check */ - err = ocelot_ptp_tx_type_to_cmd(cfg.tx_type, &ptp_cmd); + err = ocelot_ptp_tx_type_to_cmd(cfg->tx_type, &ptp_cmd); if (err) return err; - switch (cfg.rx_filter) { + switch (cfg->rx_filter) { case HWTSTAMP_FILTER_NONE: break; case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: @@ -577,27 +572,15 @@ int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr) return -ERANGE; } - old_l2 = ocelot_port->trap_proto & OCELOT_PROTO_PTP_L2; - old_l4 = ocelot_port->trap_proto & OCELOT_PROTO_PTP_L4; - err = ocelot_setup_ptp_traps(ocelot, port, l2, l4); if (err) return err; ocelot_port->ptp_cmd = ptp_cmd; - cfg.rx_filter = ocelot_traps_to_ptp_rx_filter(ocelot_port->trap_proto); - - if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) { - err = -EFAULT; - goto out_restore_ptp_traps; - } + cfg->rx_filter = ocelot_traps_to_ptp_rx_filter(ocelot_port->trap_proto); return 0; -out_restore_ptp_traps: - ocelot_setup_ptp_traps(ocelot, port, old_l2, old_l4); - ocelot_port->ptp_cmd = old_ptp_cmd; - return err; } EXPORT_SYMBOL(ocelot_hwstamp_set); diff --git a/include/net/dsa.h b/include/net/dsa.h index a0a9481c52c2..55e2d97f247e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1131,9 +1131,10 @@ struct dsa_switch_ops { * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6db7fc9dbaa4..48d6deb3efd7 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1073,8 +1073,11 @@ int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged); int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); -int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); -int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); +void ocelot_hwstamp_get(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg); +int ocelot_hwstamp_set(struct ocelot *ocelot, int port, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, struct sk_buff *skb, struct sk_buff **clone); diff --git a/net/dsa/port.c b/net/dsa/port.c index 5c9d1798e830..082573ae6864 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -116,19 +116,15 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp) bool dsa_port_supports_hwtstamp(struct dsa_port *dp) { + struct kernel_hwtstamp_config config = {}; struct dsa_switch *ds = dp->ds; - struct ifreq ifr = {}; int err; if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set) return false; - /* "See through" shim implementations of the "get" method. - * Since we can't cook up a complete ioctl request structure, this will - * fail in copy_to_user() with -EFAULT, which hopefully is enough to - * detect a valid implementation. - */ - err = ds->ops->port_hwtstamp_get(ds, dp->index, &ifr); + /* "See through" shim implementations of the "get" method. */ + err = ds->ops->port_hwtstamp_get(ds, dp->index, &config); return err != -EOPNOTSUPP; } diff --git a/net/dsa/user.c b/net/dsa/user.c index 804dc7dac4f2..e9334520c54a 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -578,20 +578,6 @@ dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - int port = p->dp->index; - - /* Pass through to switch driver if it supports timestamping */ - switch (cmd) { - case SIOCGHWTSTAMP: - if (ds->ops->port_hwtstamp_get) - return ds->ops->port_hwtstamp_get(ds, port, ifr); - break; - case SIOCSHWTSTAMP: - if (ds->ops->port_hwtstamp_set) - return ds->ops->port_hwtstamp_set(ds, port, ifr); - break; - } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } @@ -2574,6 +2560,31 @@ static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, return 0; } +static int dsa_user_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_get) + return -EOPNOTSUPP; + + return ds->ops->port_hwtstamp_get(ds, dp->index, cfg); +} + +static int dsa_user_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_set) + return -EOPNOTSUPP; + + return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); +} + static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, @@ -2595,6 +2606,8 @@ static const struct net_device_ops dsa_user_netdev_ops = { .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, + .ndo_hwtstamp_get = dsa_user_hwtstamp_get, + .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { -- cgit v1.2.3 From a96876057b9e44f60d936f8e4887543555b0593c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 May 2025 14:27:51 -0700 Subject: netlink: fix policy dump for int with validation callback Recent devlink change added validation of an integer value via NLA_POLICY_VALIDATE_FN, for sparse enums. Handle this in policy dump. We can't extract any info out of the callback, so report only the type. Fixes: 429ac6211494 ("devlink: define enum for attr types of dynamic attributes") Reported-by: syzbot+01eb26848144516e7f0a@syzkaller.appspotmail.com Link: https://patch.msgid.link/20250509212751.1905149-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 6 ++++++ net/netlink/policy.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 82e07e272290..90a560dc167a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -321,7 +321,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. + * * All other Unused - but note that it's a union * * Example: diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 1f8909c16f14..99458da6be32 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -311,6 +311,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; + } else if (pt->validation_type == NLA_VALIDATE_FUNCTION) { + break; } nla_get_range_unsigned(pt, &range); @@ -340,6 +342,9 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, else type = NL_ATTR_TYPE_SINT; + if (pt->validation_type == NLA_VALIDATE_FUNCTION) + break; + nla_get_range_signed(pt, &range); if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, -- cgit v1.2.3 From 03e96b8c11d140fb4ead0b30c2d6e1a294b501ef Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:21 +0000 Subject: netmem: add niov->type attribute to distinguish different net_iov types Later patches in the series adds TX net_iovs where there is no pp associated, so we can't rely on niov->pp->mp_ops to tell what is the type of the net_iov. Add a type enum to the net_iov which tells us the net_iov type. Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20250508004830.4100853-2-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/net/netmem.h | 11 ++++++++++- io_uring/zcrx.c | 1 + net/core/devmem.c | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index c61d5b21e7b4..973fdbcfef38 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -20,8 +20,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); */ #define NET_IOV 0x01UL +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, + + /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass. + */ + NET_IOV_MAX = ULONG_MAX +}; + struct net_iov { - unsigned long __unused_padding; + enum net_iov_type type; unsigned long pp_magic; struct page_pool *pp; struct net_iov_area *owner; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f30..a07ad38935c8 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -259,6 +259,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->owner = &area->nia; area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); + niov->type = NET_IOV_IOURING; } area->free_count = nr_iovs; diff --git a/net/core/devmem.c b/net/core/devmem.c index 6e27a47d0493..f5c3a7e6dbb7 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,7 +30,7 @@ static const struct memory_provider_ops dmabuf_devmem_ops; bool net_is_devmem_iov(struct net_iov *niov) { - return niov->pp->mp_ops == &dmabuf_devmem_ops; + return niov->type == NET_IOV_DMABUF; } static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, @@ -266,6 +266,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); -- cgit v1.2.3 From e9f3d61db5cb29b3f17f0dc40c3ec2cda2ee93e5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:22 +0000 Subject: net: add get_netmem/put_netmem support Currently net_iovs support only pp ref counts, and do not support a page ref equivalent. This is fine for the RX path as net_iovs are used exclusively with the pp and only pp refcounting is needed there. The TX path however does not use pp ref counts, thus, support for get_page/put_page equivalent is needed for netmem. Support get_netmem/put_netmem. Check the type of the netmem before passing it to page or net_iov specific code to obtain a page ref equivalent. For dmabuf net_iovs, we obtain a ref on the underlying binding. This ensures the entire binding doesn't disappear until all the net_iovs have been put_netmem'ed. We do not need to track the refcount of individual dmabuf net_iovs as we don't allocate/free them from a pool similar to what the buddy allocator does for pages. This code is written to be extensible by other net_iov implementers. get_netmem/put_netmem will check the type of the netmem and route it to the correct helper: pages -> [get|put]_page() dmabuf net_iovs -> net_devmem_[get|put]_net_iov() new net_iovs -> new helpers Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-3-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff_ref.h | 4 ++-- include/net/netmem.h | 3 +++ net/core/devmem.c | 10 ++++++++++ net/core/devmem.h | 20 ++++++++++++++++++++ net/core/skbuff.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488..9e49372ef1a0 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** diff --git a/include/net/netmem.h b/include/net/netmem.h index 973fdbcfef38..ecb6b29c93f6 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -273,4 +273,7 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + #endif /* _NET_NETMEM_H */ diff --git a/net/core/devmem.c b/net/core/devmem.c index f5c3a7e6dbb7..dca2ff7cf692 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -295,6 +295,16 @@ err_put_dmabuf: return ERR_PTR(err); } +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 7fc158d52729..946f2e015746 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -29,6 +29,10 @@ struct net_devmem_dmabuf_binding { * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. */ refcount_t ref; @@ -111,6 +115,9 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) __net_devmem_dmabuf_binding_free(binding); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); @@ -120,6 +127,19 @@ bool net_is_devmem_iov(struct net_iov *niov); #else struct net_devmem_dmabuf_binding; +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + static inline void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d73ad79fe739..00c22bce98e4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -89,6 +89,7 @@ #include #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -7313,3 +7314,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); -- cgit v1.2.3 From 8802087d20c0e1c26c4b4fe30e22264bf8285e51 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 8 May 2025 00:48:23 +0000 Subject: net: devmem: TCP tx netlink api Add bind-tx netlink call to attach dmabuf for TX; queue is not required, only ifindex and dmabuf fd for attachment. Signed-off-by: Stanislav Fomichev Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20250508004830.4100853-4-almasrymina@google.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/netdev.yaml | 12 ++++++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 13 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 34 insertions(+) (limited to 'include') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f5e0750ab71d..c0ef6d0d7786 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -743,6 +743,18 @@ operations: - defer-hard-irqs - gro-flush-timeout - irq-suspend-timeout + - + name: bind-tx + doc: Bind dmabuf to netdev for TX + attribute-set: dmabuf + do: + request: + attributes: + - ifindex + - fd + reply: + attributes: + - id kernel-family: headers: [ "net/netdev_netlink.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7600bf62dbdf..7eb9571786b8 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -219,6 +219,7 @@ enum { NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 739f7b6506a6..4fc44587f493 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 17d39fd64c94..cf3fad74511f 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 17aa9e42e8d5..3beef87235d8 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -968,6 +968,12 @@ err_genlmsg_free: return err; } +/* stub */ +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7600bf62dbdf..7eb9571786b8 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -219,6 +219,7 @@ enum { NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From bd61848900bff597764238f3a8ec67c815cd316e Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:24 +0000 Subject: net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. Additional to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf, Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev . A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 17 +++-- include/net/sock.h | 1 + io_uring/zcrx.c | 2 +- net/core/datagram.c | 48 ++++++++++++- net/core/devmem.c | 118 +++++++++++++++++++++++++++----- net/core/devmem.h | 61 ++++++++++++++--- net/core/netdev-genl.c | 71 ++++++++++++++++++- net/core/skbuff.c | 18 +++-- net/core/sock.c | 5 ++ net/ipv4/ip_output.c | 3 +- net/ipv4/tcp.c | 48 ++++++++++--- net/ipv6/ip6_output.c | 3 +- net/vmw_vsock/virtio_transport_common.c | 5 +- 13 files changed, 340 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f3e72be6f634..c7397b17bb08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +struct net_devmem_dmabuf_binding; + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/net/sock.h b/include/net/sock.h index f0fabb9fd28a..3e15d7105ad2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1851,6 +1851,7 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a07ad38935c8..0c64129f5d50 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, return io_zcrx_copy_frag(req, ifq, frag, off, len); niov = netmem_to_net_iov(frag->netmem); - if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; diff --git a/net/core/datagram.c b/net/core/datagram.c index f0634f0cb834..9ef5442536f5 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -63,6 +63,8 @@ #include #include +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. + */ + if (iov_iter_type(from) != ITER_IOVEC) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/devmem.c b/net/core/devmem.c index dca2ff7cf692..88065e5ef651 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -166,8 +176,9 @@ err_close_rxq: } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack) +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; @@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->dev = dev; - - err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, - binding, xa_limit_32b, &id_alloc_next, - GFP_KERNEL); - if (err < 0) - goto err_free_binding; - xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); refcount_set(&binding->ref, 1); @@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_id; + goto err_free_binding; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... */ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -270,24 +284,32 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; } virtual += len; } + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + return binding; err_free_chunks: gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); err_detach: dma_buf_detach(dmabuf, binding->attachment); -err_free_id: - xa_erase(&net_devmem_dmabuf_bindings, binding->id); err_free_binding: kfree(binding); err_put_dmabuf: @@ -295,6 +317,21 @@ err_put_dmabuf: return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + void net_devmem_get_net_iov(struct net_iov *niov) { net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); @@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov) net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); } +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 946f2e015746..67168aae5e5b 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -23,8 +23,9 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in @@ -32,7 +33,10 @@ struct net_devmem_dmabuf_binding { * * net_devmem_get_net_iov() on dmabuf net_iovs will increment this * reference, making sure that the binding remains alive until all the - * net_iovs are no longer used. + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -48,6 +52,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -64,14 +76,17 @@ struct dmabuf_genpool_chunk_owner { dma_addr_t base_dma_addr; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack); +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -100,10 +115,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -112,7 +127,8 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) if (!refcount_dec_and_test(&binding->ref)) return; - __net_devmem_dmabuf_binding_free(binding); + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -123,6 +139,11 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; @@ -140,18 +161,23 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) { } -static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + enum dma_data_direction direction, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -190,6 +216,19 @@ static inline bool net_is_devmem_iov(struct net_iov *niov) { return false; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 3beef87235d8..93ea57d96931 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -907,7 +907,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -968,10 +969,74 @@ err_genlmsg_free: return err; } -/* stub */ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { - return 0; + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + list_add(&binding->list, &priv->bindings); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00c22bce98e4..4159107f1666 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1655,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1670,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1693,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1723,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1738,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1842,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1861,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; diff --git a/net/core/sock.c b/net/core/sock.c index b64df2463300..347ce75482f5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3018,6 +3018,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EPERM; sockc->priority = *(u32 *)CMSG_DATA(cmsg); break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e18d7ec5062..a2705d454fd6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 86c427f16636..0ae265d39184 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1066,11 +1067,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + bool sockc_valid = true; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + /* Don't return error until MSG_FASTOPEN has been + * processed; that may succeed even if the cmsg is + * invalid. + */ + sockc_valid = false; + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1078,7 +1091,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + sockc_valid && !!sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1087,12 +1101,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (sockc_valid && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (sockc_valid && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1131,13 +1160,10 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { + if (!sockc_valid) { + if (!err) err = -EINVAL; - goto out_err; - } + goto out_err; } /* This should be in poll */ @@ -1258,7 +1284,8 @@ new_segment: goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1339,6 +1366,8 @@ out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1356,6 +1385,9 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef052ccd9380..7bd29a9ff0db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1524,7 +1524,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d88096..6e7b727c781c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, { if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, - len); + &info->msg->msg_iter, len, NULL); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } -- cgit v1.2.3 From 383faec0fd64b9bff15eb5f700f023ec35520a96 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 8 May 2025 00:48:26 +0000 Subject: net: enable driver support for netmem TX Drivers need to make sure not to pass netmem dma-addrs to the dma-mapping API in order to support netmem TX. Add helpers and netmem_dma_*() helpers that enables special handling of netmem dma-addrs that drivers can use. Document in netmem.rst what drivers need to do to support netmem TX. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250508004830.4100853-7-almasrymina@google.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 5 +++++ Documentation/networking/netmem.rst | 23 ++++++++++++++++++++-- include/linux/netdevice.h | 2 ++ include/net/netmem.h | 20 +++++++++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index ca8605eb82ff..c69cc89c958e 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -10,6 +10,7 @@ Type Name fastpath_tx_acce =================================== =========================== =================== =================== =================================================================================== unsigned_long:32 priv_flags read_mostly __dev_queue_xmit(tx) unsigned_long:1 lltx read_mostly HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx) +unsigned long:1 netmem_tx:1; read_mostly char name[16] struct netdev_name_node* name_node struct dev_ifalias* ifalias diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index 5014f7cc1398..02bd7536fc0c 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -188,3 +188,8 @@ Redundancy) frames from one port to another in hardware. This should be set for devices which duplicate outgoing HSR (High-availability Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically frames in hardware. + +* netmem-tx + +This should be set for devices which support netmem TX. See +Documentation/networking/netmem.rst diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst index 7de21ddb5412..b63aded46337 100644 --- a/Documentation/networking/netmem.rst +++ b/Documentation/networking/netmem.rst @@ -19,8 +19,8 @@ Benefits of Netmem : * Simplified Development: Drivers interact with a consistent API, regardless of the underlying memory implementation. -Driver Requirements -=================== +Driver RX Requirements +====================== 1. The driver must support page_pool. @@ -77,3 +77,22 @@ Driver Requirements that purpose, but be mindful that some netmem types might have longer circulation times, such as when userspace holds a reference in zerocopy scenarios. + +Driver TX Requirements +====================== + +1. The Driver must not pass the netmem dma_addr to any of the dma-mapping APIs + directly. This is because netmem dma_addrs may come from a source like + dma-buf that is not compatible with the dma-mapping APIs. + + Helpers like netmem_dma_unmap_page_attrs() & netmem_dma_unmap_addr_set() + should be used in lieu of dma_unmap_page[_attrs](), dma_unmap_addr_set(). + The netmem variants will handle netmem dma_addrs correctly regardless of the + source, delegating to the dma-mapping APIs when appropriate. + + Not all dma-mapping APIs have netmem equivalents at the moment. If your + driver relies on a missing netmem API, feel free to add and propose to + netdev@, or reach out to the maintainers and/or almasrymina@google.com for + help adding the netmem API. + +2. Driver should declare support by setting `netdev->netmem_tx = true` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 773167508c82..32a1e41636a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1772,6 +1772,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2087,6 +2088,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; diff --git a/include/net/netmem.h b/include/net/netmem.h index ecb6b29c93f6..386164fb9c18 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,6 +8,7 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include #include #include @@ -276,4 +277,23 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) void get_netmem(netmem_ref netmem); void put_netmem(netmem_ref netmem); +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ -- cgit v1.2.3 From dc75c3ced10c611f524e9e444303a0249ce32e43 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 12 May 2025 22:20:59 +0200 Subject: net: phy: remove stub for mdiobus_register_board_info The functionality of mdiobus_register_board_info() typically isn't optional for the caller. Therefore remove the stub. Note: Currently we have only one caller of mdiobus_register_board_info(), in a DSA/PHYLINK context. Therefore CONFIG_MDIO_DEVICE is selected anyway. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/410a2222-c4e8-45b0-9091-d49674caeb00@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d62d292024bc..7c29d346d4b3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,17 +2071,8 @@ struct mdio_board_info { const void *platform_data; }; -#if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); -#else -static inline int mdiobus_register_board_info(const struct mdio_board_info *i, - unsigned int n) -{ - return 0; -} -#endif - /** * phy_module_driver() - Helper macro for registering PHY drivers -- cgit v1.2.3 From c16608005ccb99fbde3a4cd96eab28e16f148abf Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 13 May 2025 11:19:22 +0300 Subject: net: Look for bonding slaves in the bond's network namespace Update the for_each_netdev_in_bond_rcu macro to iterate through network devices in the bond's network namespace instead of always using init_net. This change is safe because: 1. **Bond-Slave Namespace Relationship**: A bond device and its slaves must reside in the same network namespace. The bond device's namespace is established at creation time and cannot change. 2. **Slave Movement Implications**: Any attempt to move a slave device to a different namespace automatically removes it from the bond, as per kernel networking stack rules. This maintains the invariant that slaves must exist in the same namespace as their bond. This change is part of an effort to enable Link Aggregation (LAG) to work properly inside custom network namespaces. Previously, the macro would only find slave devices in the initial network namespace, preventing proper bonding functionality in custom namespaces. Signed-off-by: Shay Drory Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250513081922.525716-1-mbloch@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 32a1e41636a9..9e3a2d8452d6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3268,7 +3268,7 @@ int call_netdevice_notifiers_info(unsigned long val, #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ - for_each_netdev_rcu(&init_net, slave) \ + for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) -- cgit v1.2.3 From b9eef3391de028fdd88fd7a2f81a4834fc98c9ac Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:26 +0200 Subject: xdp: Use nested-BH locking for system_page_pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit system_page_pool is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a page_pool member (original system_page_pool) and a local_lock_t and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: Andrew Lunn Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jesper Dangaard Brouer Cc: John Fastabend Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250512092736.229935-6-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 15 ++++++++++----- net/core/xdp.c | 15 ++++++++++----- 3 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9e3a2d8452d6..73a97cf1bbce 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3503,7 +3503,12 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); -DECLARE_PER_CPU(struct page_pool *, system_page_pool); + +struct page_pool_bh { + struct page_pool *pool; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) diff --git a/net/core/dev.c b/net/core/dev.c index 33d5e95209cb..eea26162a585 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -462,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -5322,7 +5324,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -12712,7 +12717,7 @@ static int net_page_pool_create(int cpuid) return err; } - per_cpu(system_page_pool, cpuid) = pp_ptr; + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } @@ -12842,13 +12847,13 @@ out: for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/xdp.c b/net/core/xdp.c index 4e91c7790671..e6f22ba61c1e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -739,25 +739,27 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, */ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { - struct page_pool *pp = this_cpu_read(system_page_pool); const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct page_pool *pp; int metalen; void *data; if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); data = page_pool_dev_alloc_va(pp, &truesize); if (unlikely(!data)) - return NULL; + goto out; skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); - return NULL; + goto out; } skb_mark_for_recycle(skb); @@ -776,13 +778,16 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { napi_consume_skb(skb, true); - return NULL; + skb = NULL; + goto out; } xsk_buff_free(xdp); skb->protocol = eth_type_trans(skb, rxq->dev); +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); -- cgit v1.2.3 From 7fe70c06a182a140be9996b02256d907e114479a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 11:27:31 +0200 Subject: net/sched: act_mirred: Move the recursion counter struct netdev_xmit mirred_nest_level is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers. Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Juri Lelli Link: https://patch.msgid.link/20250512092736.229935-11-bigeasy@linutronix.de Signed-off-by: Paolo Abeni --- include/linux/netdevice_xmit.h | 3 +++ net/sched/act_mirred.c | 28 +++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 38325e070296..848735b3a7c0 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -8,6 +8,9 @@ struct netdev_xmit { #ifdef CONFIG_NET_EGRESS u8 skip_txqueue; #endif +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) + u8 sched_mirred_nest; +#endif }; #endif diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5b3814365924..5f01f567c934 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } -- cgit v1.2.3 From c1269d3d12b88151ee4c109624b5022d53a11738 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 May 2025 19:39:09 +0000 Subject: tcp: add tcp_rcvbuf_grow() tracepoint Provide a new tracepoint to better understand tcp_rcv_space_adjust() (currently broken) behavior. Call it only when tcp_rcv_space_adjust() has a chance to make a change. I chose to leave trace_tcp_rcv_space_adjust() as is, because commit 6163849d289b ("net: introduce a new tracepoint for tcp_rcv_space_adjust") intent was to get it called after each data delivery to user space. Tested: Pair of hosts in the same rack. Ideally, sk->sk_rcvbuf should be kept small. echo "4096 131072 33554432" >/proc/sys/net/ipv4/tcp_rmem ./netserver perf record -C10 -e tcp:tcp_rcvbuf_grow sleep 30 Trace for a TS enabled TCP flow (with standard ms granularity) perf script // We can see that sk_rcvbuf is growing very fast to tcp_mem[2] 260.500397: tcp:tcp_rcvbuf_grow: time=291 rtt_us=274 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ... 260.501333: tcp:tcp_rcvbuf_grow: time=555 rtt_us=364 copied=333824 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 260.501664: tcp:tcp_rcvbuf_grow: time=331 rtt_us=330 copied=798720 inq=0 space=333824 ooo=0 scaling_ratio=230 rcvbuf=4110551 ... 260.502003: tcp:tcp_rcvbuf_grow: time=340 rtt_us=330 copied=1040384 inq=49152 space=798720 ooo=0 scaling_ratio=230 rcvbuf=7006410 ... 260.502483: tcp:tcp_rcvbuf_grow: time=479 rtt_us=330 copied=2658304 inq=49152 space=1040384 ooo=0 scaling_ratio=230 rcvbuf=7006410 ... 260.502899: tcp:tcp_rcvbuf_grow: time=416 rtt_us=413 copied=4026368 inq=147456 space=2658304 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.504233: tcp:tcp_rcvbuf_grow: time=493 rtt_us=487 copied=4800512 inq=196608 space=4026368 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.504792: tcp:tcp_rcvbuf_grow: time=559 rtt_us=551 copied=5672960 inq=49152 space=4800512 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.506614: tcp:tcp_rcvbuf_grow: time=610 rtt_us=607 copied=6688768 inq=180224 space=5672960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.507280: tcp:tcp_rcvbuf_grow: time=666 rtt_us=656 copied=6868992 inq=49152 space=6688768 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.507979: tcp:tcp_rcvbuf_grow: time=699 rtt_us=699 copied=7000064 inq=0 space=6868992 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.508681: tcp:tcp_rcvbuf_grow: time=703 rtt_us=699 copied=7208960 inq=0 space=7000064 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.509426: tcp:tcp_rcvbuf_grow: time=744 rtt_us=737 copied=7569408 inq=0 space=7208960 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.510213: tcp:tcp_rcvbuf_grow: time=787 rtt_us=770 copied=7880704 inq=49152 space=7569408 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.511013: tcp:tcp_rcvbuf_grow: time=801 rtt_us=798 copied=8339456 inq=0 space=7880704 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.511860: tcp:tcp_rcvbuf_grow: time=847 rtt_us=824 copied=8601600 inq=49152 space=8339456 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.512710: tcp:tcp_rcvbuf_grow: time=850 rtt_us=846 copied=8814592 inq=65536 space=8601600 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.514428: tcp:tcp_rcvbuf_grow: time=871 rtt_us=865 copied=8855552 inq=49152 space=8814592 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.515333: tcp:tcp_rcvbuf_grow: time=905 rtt_us=882 copied=9228288 inq=49152 space=8855552 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.516237: tcp:tcp_rcvbuf_grow: time=905 rtt_us=896 copied=9371648 inq=49152 space=9228288 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.517149: tcp:tcp_rcvbuf_grow: time=911 rtt_us=909 copied=9543680 inq=49152 space=9371648 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.518070: tcp:tcp_rcvbuf_grow: time=921 rtt_us=921 copied=9793536 inq=0 space=9543680 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.520895: tcp:tcp_rcvbuf_grow: time=948 rtt_us=947 copied=10203136 inq=114688 space=9793536 ooo=0 scaling_ratio=230 rcvbuf=24622616 ... 260.521853: tcp:tcp_rcvbuf_grow: time=959 rtt_us=954 copied=10293248 inq=57344 space=10203136 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.522818: tcp:tcp_rcvbuf_grow: time=964 rtt_us=959 copied=10330112 inq=0 space=10293248 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.524760: tcp:tcp_rcvbuf_grow: time=979 rtt_us=969 copied=10633216 inq=49152 space=10330112 ooo=0 scaling_ratio=230 rcvbuf=24691992 ... 260.526709: tcp:tcp_rcvbuf_grow: time=975 rtt_us=973 copied=12013568 inq=163840 space=10633216 ooo=0 scaling_ratio=230 rcvbuf=25136755 ... 260.527694: tcp:tcp_rcvbuf_grow: time=985 rtt_us=976 copied=12025856 inq=32768 space=12013568 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.530655: tcp:tcp_rcvbuf_grow: time=991 rtt_us=986 copied=12050432 inq=98304 space=12025856 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.533626: tcp:tcp_rcvbuf_grow: time=993 rtt_us=989 copied=12124160 inq=0 space=12050432 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.538606: tcp:tcp_rcvbuf_grow: time=1000 rtt_us=994 copied=12222464 inq=49152 space=12124160 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.545605: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=998 copied=12263424 inq=81920 space=12222464 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.553626: tcp:tcp_rcvbuf_grow: time=1005 rtt_us=999 copied=12320768 inq=12288 space=12263424 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.589749: tcp:tcp_rcvbuf_grow: time=1001 rtt_us=1000 copied=12398592 inq=16384 space=12320768 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 260.806577: tcp:tcp_rcvbuf_grow: time=1010 rtt_us=1000 copied=12402688 inq=32768 space=12398592 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.002386: tcp:tcp_rcvbuf_grow: time=1002 rtt_us=1000 copied=12419072 inq=98304 space=12402688 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.803432: tcp:tcp_rcvbuf_grow: time=1013 rtt_us=1000 copied=12468224 inq=49152 space=12419072 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 261.829533: tcp:tcp_rcvbuf_grow: time=1004 rtt_us=1000 copied=12615680 inq=0 space=12468224 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... 265.505435: tcp:tcp_rcvbuf_grow: time=1007 rtt_us=1000 copied=12632064 inq=32768 space=12615680 ooo=0 scaling_ratio=230 rcvbuf=33554432 ... We also see rtt_us going gradually to 1000 usec, causing massive overshoot. Trace for a usec TS enabled TCP flow (us granularity) perf script // We can see that sk_rcvbuf is growing to a smaller value, thanks to tight rtt_us values. 1509.273955: tcp:tcp_rcvbuf_grow: time=396 rtt_us=377 copied=110592 inq=0 space=41080 ooo=0 scaling_ratio=230 rcvbuf=131072 ... 1509.274366: tcp:tcp_rcvbuf_grow: time=412 rtt_us=365 copied=129024 inq=0 space=110592 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.274738: tcp:tcp_rcvbuf_grow: time=372 rtt_us=355 copied=194560 inq=0 space=129024 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.275020: tcp:tcp_rcvbuf_grow: time=282 rtt_us=257 copied=401408 inq=0 space=194560 ooo=0 scaling_ratio=230 rcvbuf=1399144 ... 1509.275190: tcp:tcp_rcvbuf_grow: time=170 rtt_us=144 copied=741376 inq=229376 space=401408 ooo=0 scaling_ratio=230 rcvbuf=3021625 ... 1509.275300: tcp:tcp_rcvbuf_grow: time=110 rtt_us=110 copied=1146880 inq=65536 space=741376 ooo=0 scaling_ratio=230 rcvbuf=4642390 ... 1509.275449: tcp:tcp_rcvbuf_grow: time=149 rtt_us=106 copied=1310720 inq=737280 space=1146880 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275560: tcp:tcp_rcvbuf_grow: time=111 rtt_us=107 copied=1388544 inq=430080 space=1310720 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275674: tcp:tcp_rcvbuf_grow: time=114 rtt_us=113 copied=1495040 inq=421888 space=1388544 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275800: tcp:tcp_rcvbuf_grow: time=126 rtt_us=126 copied=1572864 inq=77824 space=1495040 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.275968: tcp:tcp_rcvbuf_grow: time=168 rtt_us=161 copied=1863680 inq=172032 space=1572864 ooo=0 scaling_ratio=230 rcvbuf=5498637 ... 1509.276129: tcp:tcp_rcvbuf_grow: time=161 rtt_us=161 copied=1941504 inq=204800 space=1863680 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.276288: tcp:tcp_rcvbuf_grow: time=159 rtt_us=158 copied=1990656 inq=131072 space=1941504 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.276900: tcp:tcp_rcvbuf_grow: time=228 rtt_us=226 copied=2883584 inq=266240 space=1990656 ooo=0 scaling_ratio=230 rcvbuf=5782790 ... 1509.277819: tcp:tcp_rcvbuf_grow: time=242 rtt_us=236 copied=3022848 inq=0 space=2883584 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.278072: tcp:tcp_rcvbuf_grow: time=253 rtt_us=247 copied=3055616 inq=49152 space=3022848 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.279560: tcp:tcp_rcvbuf_grow: time=268 rtt_us=264 copied=3133440 inq=180224 space=3055616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.279833: tcp:tcp_rcvbuf_grow: time=274 rtt_us=270 copied=3424256 inq=0 space=3133440 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.282187: tcp:tcp_rcvbuf_grow: time=277 rtt_us=273 copied=3465216 inq=180224 space=3424256 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.284685: tcp:tcp_rcvbuf_grow: time=292 rtt_us=292 copied=3481600 inq=147456 space=3465216 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.284983: tcp:tcp_rcvbuf_grow: time=297 rtt_us=295 copied=3702784 inq=45056 space=3481600 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.285596: tcp:tcp_rcvbuf_grow: time=311 rtt_us=310 copied=3723264 inq=40960 space=3702784 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.285909: tcp:tcp_rcvbuf_grow: time=313 rtt_us=304 copied=3846144 inq=196608 space=3723264 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.291654: tcp:tcp_rcvbuf_grow: time=322 rtt_us=311 copied=3960832 inq=49152 space=3846144 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.291986: tcp:tcp_rcvbuf_grow: time=333 rtt_us=330 copied=4075520 inq=360448 space=3960832 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.292319: tcp:tcp_rcvbuf_grow: time=332 rtt_us=332 copied=4079616 inq=65536 space=4075520 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.292666: tcp:tcp_rcvbuf_grow: time=348 rtt_us=347 copied=4177920 inq=212992 space=4079616 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.293015: tcp:tcp_rcvbuf_grow: time=349 rtt_us=345 copied=4276224 inq=262144 space=4177920 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.293371: tcp:tcp_rcvbuf_grow: time=356 rtt_us=346 copied=4415488 inq=49152 space=4276224 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... 1509.515798: tcp:tcp_rcvbuf_grow: time=424 rtt_us=411 copied=4833280 inq=81920 space=4415488 ooo=0 scaling_ratio=230 rcvbuf=12316197 ... Signed-off-by: Eric Dumazet Reviewed-by: Wei Wang Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250513193919.1089692-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 2 ++ 2 files changed, 75 insertions(+) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 53e878fa14d1..006c2116c8f6 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -213,6 +213,79 @@ DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_ARGS(sk) ); +TRACE_EVENT(tcp_rcvbuf_grow, + + TP_PROTO(struct sock *sk, int time), + + TP_ARGS(sk, time), + + TP_STRUCT__entry( + __field(int, time) + __field(__u32, rtt_us) + __field(__u32, copied) + __field(__u32, inq) + __field(__u32, space) + __field(__u32, ooo_space) + __field(__u32, rcvbuf) + __field(__u8, scaling_ratio) + __field(__u16, sport) + __field(__u16, dport) + __field(__u16, family) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + __field(const void *, skaddr) + __field(__u64, sock_cookie) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __be32 *p32; + + __entry->time = time; + __entry->rtt_us = tp->rcv_rtt_est.rtt_us >> 3; + __entry->copied = tp->copied_seq - tp->rcvq_space.seq; + __entry->inq = tp->rcv_nxt - tp->copied_seq; + __entry->space = tp->rcvq_space.space; + __entry->ooo_space = RB_EMPTY_ROOT(&tp->out_of_order_queue) ? 0 : + TCP_SKB_CB(tp->ooo_last_skb)->end_seq - + tp->rcv_nxt; + + __entry->rcvbuf = sk->sk_rcvbuf; + __entry->scaling_ratio = tp->scaling_ratio; + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + + TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, + sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); + + __entry->skaddr = sk; + __entry->sock_cookie = sock_gen_cookie(sk); + ), + + TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u " + "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 " + "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx", + __entry->time, __entry->rtt_us, __entry->copied, + __entry->inq, __entry->space, __entry->ooo_space, + __entry->scaling_ratio, __entry->rcvbuf, + show_family_name(__entry->family), + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + __entry->skaddr, + __entry->sock_cookie) +); + TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a35018e2d0ba..88beb6d0f7b5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -769,6 +769,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (copied <= tp->rcvq_space.space) goto new_measure; + trace_tcp_rcvbuf_grow(sk, time); + /* A bit of theory : * copied = bytes received in previous RTT, our base window * To cope with packet losses, we need a 2x factor -- cgit v1.2.3 From ea33537d82921e71f852ea2ed985acc562125efe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 May 2025 19:39:12 +0000 Subject: tcp: add receive queue awareness in tcp_rcv_space_adjust() If the application can not drain fast enough a TCP socket queue, tcp_rcv_space_adjust() can overestimate tp->rcvq_space.space. Then sk->sk_rcvbuf can grow and hit tcp_rmem[2] for no good reason. Fix this by taking into acount the number of available bytes. Keeping sk->sk_rcvbuf at the right size allows better cache efficiency. Signed-off-by: Eric Dumazet Reviewed-by: Wei Wang Link: https://patch.msgid.link/20250513193919.1089692-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- net/ipv4/tcp_input.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a8af71623ba7..29f59d50dc73 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -340,7 +340,7 @@ struct tcp_sock { } rcv_rtt_est; /* Receiver queue space */ struct { - u32 space; + int space; u32 seq; u64 time; } rcvq_space; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f799200db264..5d64a6ecfc8f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -780,8 +780,7 @@ static void tcp_rcvbuf_grow(struct sock *sk) void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 copied; - int time; + int time, inq, copied; trace_tcp_rcv_space_adjust(sk); @@ -792,6 +791,9 @@ void tcp_rcv_space_adjust(struct sock *sk) /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; + /* Number of bytes in receive queue. */ + inq = tp->rcv_nxt - tp->copied_seq; + copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; -- cgit v1.2.3 From 1119e5519dcdb7b3527f5d85accf9c7aa02b2b28 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 13 May 2025 15:17:52 -0700 Subject: net: sched: uapi: add more sanely named duplicate defines The TCA_FLOWER_KEY_CFM enum has a UNSPEC and MAX with _OPT in the name, but the real attributes don't. Add a MAX that more reasonably matches the attrs. The PAD in TCA_TAPRIO is the only attr which doesn't have _ATTR in it, perhaps signifying that it's not a real attr? If so interesting idea in abstract but it makes codegen painful. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250513221752.843102-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/pkt_sched.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 490821364165..28d94b11d1aa 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -697,6 +697,7 @@ enum { }; #define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) +#define TCA_FLOWER_KEY_CFM_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9ea874395717..3e41349f3fa2 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1182,6 +1182,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_PAD = TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ -- cgit v1.2.3 From 9cd5ef0b8c04c46a15c8f5d002f02ea0d0477790 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 May 2025 10:03:54 +0000 Subject: net: rfs: add sock_rps_delete_flow() helper RFS can exhibit lower performance for workloads using short-lived flows and a small set of 4-tuple. This is often the case for load-testers, using a pair of hosts, if the server has a single listener port. Typical use case : Server : tcp_crr -T128 -F1000 -6 -U -l30 -R 14250 Client : tcp_crr -T128 -F1000 -6 -U -l30 -c -H server | grep local_throughput This is because RFS global hash table contains stale information, when the same RSS key is recycled for another socket and another cpu. Make sure to undo the changes and go back to initial state when a flow is disconnected. Performance of the above test is increased by 22 %, going from 372604 transactions per second to 457773. Signed-off-by: Eric Dumazet Reported-by: Octavian Purdila Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250515100354.3339920-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 24 ++++++++++++++++++++++++ net/ipv4/inet_hashtables.c | 6 ++++-- net/ipv4/udp.c | 2 ++ net/sctp/socket.c | 2 +- 4 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/rps.h b/include/net/rps.h index 507f4aa5d39b..d8ab3a08bcc4 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -123,6 +123,30 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + struct rps_sock_flow_table *table; + u32 hash, index; + + if (!static_branch_unlikely(&rfs_needed)) + return; + + hash = READ_ONCE(sk->sk_rxhash); + if (!hash) + return; + + rcu_read_lock(); + table = rcu_dereference(net_hotdata.rps_sock_flow_table); + if (table) { + index = hash & table->mask; + if (READ_ONCE(table->ents[index]) != RPS_NO_CPU) + WRITE_ONCE(table->ents[index], RPS_NO_CPU); + } + rcu_read_unlock(); +#endif +} + static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index da85cc30e382..77a0b52b2eab 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -23,11 +23,12 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif -#include #include #include -#include +#include +#include #include +#include u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -790,6 +791,7 @@ void inet_unhash(struct sock *sk) if (sk_unhashed(sk)) return; + sock_rps_delete_flow(sk); if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 358b49caa7b9..dde52b8050b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -120,6 +120,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include #endif +#include struct udp_table udp_table __read_mostly; @@ -2200,6 +2201,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; + sock_rps_delete_flow(sk); hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 53725ee7ba06..85a9dfeff4d6 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8321,7 +8321,7 @@ static int sctp_hash(struct sock *sk) static void sctp_unhash(struct sock *sk) { - /* STUB */ + sock_rps_delete_flow(sk); } /* Check if port is acceptable. Possibly find first available port. -- cgit v1.2.3 From 7b151e4efdde7cc7cfaae66e497d12487a70c6e9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 14 May 2025 20:54:29 +0200 Subject: net: phy: fixed_phy: remove fixed_phy_register_with_gpiod Since its introduction 6 yrs ago this functions has never had a user. So remove it. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/ccbeef28-65ae-4e28-b1db-816c44338dee@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/fixed_phy.c | 32 +++++++------------------------- include/linux/phy_fixed.h | 14 -------------- 2 files changed, 7 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index ee7831a9849b..c91adf2464c5 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -223,12 +223,12 @@ static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) } #endif -static struct phy_device *__fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np, - struct gpio_desc *gpiod) +struct phy_device *fixed_phy_register(unsigned int irq, + struct fixed_phy_status *status, + struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; + struct gpio_desc *gpiod; struct phy_device *phy; int phy_addr; int ret; @@ -237,11 +237,9 @@ static struct phy_device *__fixed_phy_register(unsigned int irq, return ERR_PTR(-EPROBE_DEFER); /* Check if we have a GPIO associated with this fixed phy */ - if (!gpiod) { - gpiod = fixed_phy_get_gpiod(np); - if (IS_ERR(gpiod)) - return ERR_CAST(gpiod); - } + gpiod = fixed_phy_get_gpiod(np); + if (IS_ERR(gpiod)) + return ERR_CAST(gpiod); /* Get the next available PHY address, up to PHY_MAX_ADDR */ phy_addr = ida_alloc_max(&phy_fixed_ida, PHY_MAX_ADDR - 1, GFP_KERNEL); @@ -306,24 +304,8 @@ static struct phy_device *__fixed_phy_register(unsigned int irq, return phy; } - -struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) -{ - return __fixed_phy_register(irq, status, np, NULL); -} EXPORT_SYMBOL_GPL(fixed_phy_register); -struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod) -{ - return __fixed_phy_register(irq, status, NULL, gpiod); -} -EXPORT_SYMBOL_GPL(fixed_phy_register_with_gpiod); - void fixed_phy_unregister(struct phy_device *phy) { phy_device_remove(phy); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 1acafd86ab13..3392c09b5d24 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -13,7 +13,6 @@ struct fixed_phy_status { }; struct device_node; -struct gpio_desc; struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) @@ -24,11 +23,6 @@ extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); -extern struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod); - extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, @@ -46,14 +40,6 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq, return ERR_PTR(-ENODEV); } -static inline struct phy_device * -fixed_phy_register_with_gpiod(unsigned int irq, - struct fixed_phy_status *status, - struct gpio_desc *gpiod) -{ - return ERR_PTR(-ENODEV); -} - static inline void fixed_phy_unregister(struct phy_device *phydev) { } -- cgit v1.2.3 From a462903fa22541f212134fba81084315ad843e6e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 May 2025 13:59:27 +0200 Subject: net: netlink: reduce extack cookie size Seems like the extack cookie hasn't found any users outside of wireless, which always uses nl_set_extack_cookie_u64(). Thus, allocating 20 bytes for it is pointless, reduce that to 8 bytes, and add a BUILD_BUG_ON() to ensure it's enough (obviously it is, for a u64, but in case it changes again.) Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250516115927.38209-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c3ae84a77e16..882e9c1b6c1d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -63,7 +63,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) } /* this can be increased when necessary - don't expose to userland */ -#define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_COOKIE_LEN 8 #define NETLINK_MAX_FMTMSG_LEN 80 /** @@ -212,6 +212,7 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, { if (!extack) return; + BUILD_BUG_ON(sizeof(extack->cookie) < sizeof(cookie)); memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } -- cgit v1.2.3 From 84b21e61ebd64931d865ce3df49d930db8c9e2cd Mon Sep 17 00:00:00 2001 From: Gur Stavi Date: Sun, 18 May 2025 13:00:54 +0300 Subject: queue_api: reduce risk of name collision over txq Rename local variable in macros from txq to _txq. When macro parameter get_desc is expended it is likely to have a txq token that refers to a different txq variable at the caller's site. Signed-off-by: Gur Stavi Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/95b60d218f004308486d92ed17c8cc6f28bac09d.1747559621.git.gur.stavi@huawei.com Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 069ff35a72de..ba2eaf39089b 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -288,27 +288,27 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_try_stop(txq, get_desc, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ }) #define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_completed_wake(txq, pkts, bytes, \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ get_desc, start_thrs); \ }) -- cgit v1.2.3 From 31be641d74267d98317ef5a2b90e6200511cabb3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 15 May 2025 10:11:54 +0200 Subject: net: phy: make mdio consumer / device layer a separate module After having factored out the provider part from mdio_bus.c, we can make the mdio consumer / device layer a separate module. This also allows to remove Kconfig symbol MDIO_DEVICE. The module init / exit functions from mdio_bus.c no longer have to be called from phy_device.c. The link order defined in drivers/net/phy/Makefile ensures that init / exit functions are called in the right order. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dba6b156-5748-44ce-b5e2-e8dc2fcee5a7@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/mdio/Kconfig | 16 ++-------------- drivers/net/phy/Kconfig | 2 +- drivers/net/phy/Makefile | 18 +++++------------- drivers/net/phy/mdio_bus.c | 14 ++++++-------- drivers/net/phy/mdio_device.c | 1 + drivers/net/phy/phy_device.c | 11 ++--------- include/linux/phy.h | 3 --- 7 files changed, 17 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index d3219ca194ec..7db40aaa079d 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -3,21 +3,10 @@ # MDIO Layer Configuration # -menuconfig MDIO_DEVICE - tristate "MDIO bus device drivers" - help - MDIO devices and driver infrastructure code. - -if MDIO_DEVICE - config MDIO_BUS - tristate - default m if PHYLIB=m - default MDIO_DEVICE + tristate "MDIO bus consumer layer" help - This internal symbol is used for link time dependencies and it - reflects whether the mdio_bus/mdio_device code is built as a - loadable module or built-in. + MDIO bus consumer layer if PHYLIB @@ -291,4 +280,3 @@ config MDIO_BUS_MUX_MMIOREG endif -endif diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 677d56e06539..127a9fd0feb9 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -14,7 +14,7 @@ config PHYLINK menuconfig PHYLIB tristate "PHY Device support and infrastructure" - select MDIO_DEVICE + select MDIO_BUS help Ethernet controllers are usually attached to PHY devices. This option provides infrastructure for diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 59ac3a9a3177..7de69320a3a7 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -6,27 +6,19 @@ libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ phy_package.o phy_caps.o mdio_bus_provider.o mdio-bus-y += mdio_bus.o mdio_device.o -ifdef CONFIG_MDIO_DEVICE -obj-y += mdio-boardinfo.o -endif - -# PHYLIB implies MDIO_DEVICE, in that case, we have a bunch of circular -# dependencies that does not make it possible to split mdio-bus objects into a -# dedicated loadable module, so we bundle them all together into libphy.ko ifdef CONFIG_PHYLIB -libphy-y += $(mdio-bus-y) -# the stubs are built-in whenever PHYLIB is built-in or module -obj-y += stubs.o -else -obj-$(CONFIG_MDIO_DEVICE) += mdio-bus.o +# built-in whenever PHYLIB is built-in or module +obj-y += stubs.o mdio-boardinfo.o endif -obj-$(CONFIG_PHYLIB) += mdio_devres.o + libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o libphy-$(CONFIG_OPEN_ALLIANCE_HELPERS) += open_alliance_helpers.o +obj-$(CONFIG_MDIO_BUS) += mdio-bus.o obj-$(CONFIG_PHYLINK) += phylink.o obj-$(CONFIG_PHYLIB) += libphy.o +obj-$(CONFIG_PHYLIB) += mdio_devres.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += mii_timestamper.o diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index f5ccbe33a384..a6bcb0fee863 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -988,7 +988,7 @@ const struct bus_type mdio_bus_type = { }; EXPORT_SYMBOL(mdio_bus_type); -int __init mdio_bus_init(void) +static int __init mdio_bus_init(void) { int ret; @@ -1002,16 +1002,14 @@ int __init mdio_bus_init(void) return ret; } -#if IS_ENABLED(CONFIG_PHYLIB) -void mdio_bus_exit(void) +static void __exit mdio_bus_exit(void) { class_unregister(&mdio_bus_class); bus_unregister(&mdio_bus_type); } -EXPORT_SYMBOL_GPL(mdio_bus_exit); -#else -module_init(mdio_bus_init); -/* no module_exit, intentional */ + +subsys_initcall(mdio_bus_init); +module_exit(mdio_bus_exit); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MDIO bus/device layer"); -#endif diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c index e747ee63c665..cce3f405d1a4 100644 --- a/drivers/net/phy/mdio_device.c +++ b/drivers/net/phy/mdio_device.c @@ -45,6 +45,7 @@ int mdio_device_bus_match(struct device *dev, const struct device_driver *drv) return strcmp(mdiodev->modalias, drv->name) == 0; } +EXPORT_SYMBOL_GPL(mdio_device_bus_match); struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 2eb735e68dd8..781dfa6680eb 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3557,19 +3557,15 @@ static int __init phy_init(void) phylib_register_stubs(); rtnl_unlock(); - rc = mdio_bus_init(); - if (rc) - goto err_ethtool_phy_ops; - rc = phy_caps_init(); if (rc) - goto err_mdio_bus; + goto err_ethtool_phy_ops; features_init(); rc = phy_driver_register(&genphy_c45_driver, THIS_MODULE); if (rc) - goto err_mdio_bus; + goto err_ethtool_phy_ops; rc = phy_driver_register(&genphy_driver, THIS_MODULE); if (rc) @@ -3579,8 +3575,6 @@ static int __init phy_init(void) err_c45: phy_driver_unregister(&genphy_c45_driver); -err_mdio_bus: - mdio_bus_exit(); err_ethtool_phy_ops: rtnl_lock(); phylib_unregister_stubs(); @@ -3594,7 +3588,6 @@ static void __exit phy_exit(void) { phy_driver_unregister(&genphy_c45_driver); phy_driver_unregister(&genphy_driver); - mdio_bus_exit(); rtnl_lock(); phylib_unregister_stubs(); ethtool_set_ethtool_phy_ops(NULL); diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c29d346d4b3..92a88b5ce356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2033,9 +2033,6 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int __init mdio_bus_init(void); -void mdio_bus_exit(void); - int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, -- cgit v1.2.3 From 4c2bd7913f52b1e5c978edf56cdef39c30a1f603 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 17 May 2025 13:08:10 -0700 Subject: net: let lockdep compare instance locks AFAIU always returning -1 from lockdep's compare function basically disables checking of dependencies between given locks. Try to be a little more precise about what guarantees that instance locks won't deadlock. Right now we only nest them under protection of rtnl_lock. Mostly in unregister_netdevice_many() and dev_close_many(). Acked-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250517200810.466531-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 2a753813f849..c345afecd4c5 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -99,16 +99,15 @@ static inline void netdev_unlock_ops_compat(struct net_device *dev) static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { - /* Only lower devices currently grab the instance lock, so no - * real ordering issues can occur. In the near future, only - * hardware devices will grab instance lock which also does not - * involve any ordering. Suppress lockdep ordering warnings - * until (if) we start grabbing instance lock on pure SW - * devices (bond/team/veth/etc). - */ if (a == b) return 0; - return -1; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? -1 : 1; } #define netdev_lockdep_set_classes(dev) \ -- cgit v1.2.3 From 3f1716ee0f6c63795e6d225e3f5ec3825cd2bd57 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:34:32 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_add All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_add(). Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Acked-by: Greg Ungerer Link: https://patch.msgid.link/b3b9b3bc-c310-4a54-b376-c909c83575de@gmail.com Signed-off-by: Jakub Kicinski --- arch/m68k/coldfire/m5272.c | 2 +- arch/mips/bcm47xx/setup.c | 2 +- drivers/net/phy/fixed_phy.c | 5 ++--- include/linux/phy_fixed.h | 5 ++--- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index 734dab657fe3..5b70dfdab368 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -119,7 +119,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status); + fixed_phy_add(0, &nettel_fixed_phy_status); clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup)); return 0; } diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 247be207f293..de426a474b5b 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -282,7 +282,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status); + fixed_phy_add(0, &bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index c91adf2464c5..34a71f223f0f 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -160,10 +160,9 @@ static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, return 0; } -int fixed_phy_add(unsigned int irq, int phy_addr, - struct fixed_phy_status *status) +int fixed_phy_add(int phy_addr, struct fixed_phy_status *status) { - return fixed_phy_add_gpiod(irq, phy_addr, status, NULL); + return fixed_phy_add_gpiod(PHY_POLL, phy_addr, status, NULL); } EXPORT_SYMBOL_GPL(fixed_phy_add); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 3392c09b5d24..316bb4deda37 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,7 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -extern int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status); +int fixed_phy_add(int phy_id, struct fixed_phy_status *status); extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, struct device_node *np); @@ -28,7 +27,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline int fixed_phy_add(unsigned int irq, int phy_id, +static inline int fixed_phy_add(int phy_id, struct fixed_phy_status *status) { return -ENODEV; -- cgit v1.2.3 From d23b4af5df3900fb0b4e1a05cb8119dd1c395519 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:35:56 +0200 Subject: net: phy: fixed_phy: remove irq argument from fixed_phy_register All callers pass PHY_POLL, therefore remove irq argument from fixed_phy_register(). Note: I keep the irq argument in fixed_phy_add_gpiod() for now, for the case that somebody may want to use a GPIO interrupt in the future, by e.g. adding a call to fwnode_irq_get() to fixed_phy_get_gpiod(). Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/31cdb232-a5e9-4997-a285-cb9a7d208124@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/dsa_loop.c | 2 +- drivers/net/ethernet/broadcom/bgmac.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2 +- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- drivers/net/mdio/of_mdio.c | 2 +- drivers/net/phy/fixed_phy.c | 5 ++--- drivers/net/usb/lan78xx.c | 2 +- include/linux/phy_fixed.h | 11 +++++------ 8 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index adbab544c60f..d8a35f25a4c8 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -405,7 +405,7 @@ static int __init dsa_loop_init(void) unsigned int i, ret; for (i = 0; i < NUM_FIXED_PHYS; i++) - phydevs[i] = fixed_phy_register(PHY_POLL, &status, NULL); + phydevs[i] = fixed_phy_register(&status, NULL); ret = mdio_driver_register(&dsa_loop_drv); if (ret) diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index a461ec612e95..3e9c57196a39 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -1446,7 +1446,7 @@ int bgmac_phy_connect_direct(struct bgmac *bgmac) struct phy_device *phy_dev; int err; - phy_dev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); + phy_dev = fixed_phy_register(&fphy_status, NULL); if (IS_ERR(phy_dev)) { dev_err(bgmac->dev, "Failed to register fixed PHY device\n"); return PTR_ERR(phy_dev); diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 71c619d2bea5..b6437ba7a2eb 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -625,7 +625,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv) .asym_pause = 0, }; - phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); + phydev = fixed_phy_register(&fphy_status, NULL); if (IS_ERR(phydev)) { dev_err(kdev, "failed to register fixed PHY device\n"); return PTR_ERR(phydev); diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 17ec35e75a65..a98d5af3f9e3 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1906,7 +1906,7 @@ static int ftgmac100_probe(struct platform_device *pdev) goto err_phy_connect; } - phydev = fixed_phy_register(PHY_POLL, &ncsi_phy_status, np); + phydev = fixed_phy_register(&ncsi_phy_status, np); if (IS_ERR(phydev)) { dev_err(&pdev->dev, "failed to register fixed PHY device\n"); err = PTR_ERR(phydev); diff --git a/drivers/net/mdio/of_mdio.c b/drivers/net/mdio/of_mdio.c index 2f4fc664d2e1..98f667b121f7 100644 --- a/drivers/net/mdio/of_mdio.c +++ b/drivers/net/mdio/of_mdio.c @@ -458,7 +458,7 @@ int of_phy_register_fixed_link(struct device_node *np) return -ENODEV; register_phy: - return PTR_ERR_OR_ZERO(fixed_phy_register(PHY_POLL, &status, np)); + return PTR_ERR_OR_ZERO(fixed_phy_register(&status, np)); } EXPORT_SYMBOL(of_phy_register_fixed_link); diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 34a71f223f0f..ea002a137a7d 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -222,8 +222,7 @@ static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) } #endif -struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, +struct phy_device *fixed_phy_register(struct fixed_phy_status *status, struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; @@ -245,7 +244,7 @@ struct phy_device *fixed_phy_register(unsigned int irq, if (phy_addr < 0) return ERR_PTR(phy_addr); - ret = fixed_phy_add_gpiod(irq, phy_addr, status, gpiod); + ret = fixed_phy_add_gpiod(PHY_POLL, phy_addr, status, gpiod); if (ret < 0) { ida_free(&phy_fixed_ida, phy_addr); return ERR_PTR(ret); diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 58e3589e3b89..480bbc0f2d8f 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2640,7 +2640,7 @@ static struct phy_device *lan78xx_register_fixed_phy(struct lan78xx_net *dev) netdev_info(dev->net, "No PHY found on LAN7801 – registering fixed PHY (e.g. EVB-KSZ9897-1)\n"); - return fixed_phy_register(PHY_POLL, &fphy_status, NULL); + return fixed_phy_register(&fphy_status, NULL); } /** diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 316bb4deda37..634149a73c2a 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -18,9 +18,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -extern struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np); +struct phy_device *fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -32,9 +31,9 @@ static inline int fixed_phy_add(int phy_id, { return -ENODEV; } -static inline struct phy_device *fixed_phy_register(unsigned int irq, - struct fixed_phy_status *status, - struct device_node *np) +static inline struct phy_device * +fixed_phy_register(struct fixed_phy_status *status, + struct device_node *np) { return ERR_PTR(-ENODEV); } -- cgit v1.2.3 From 4ba1c5bb4811f560a86697311cb4e9741e047a5d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 17 May 2025 22:37:29 +0200 Subject: net: phy: fixed_phy: constify status argument where possible Constify the passed struct fixed_phy_status *status where possible. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/d1764b62-8538-408b-a4e3-b63715481a38@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/fixed_phy.c | 6 +++--- include/linux/phy_fixed.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index ea002a137a7d..033656d574b8 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -131,7 +131,7 @@ int fixed_phy_set_link_update(struct phy_device *phydev, EXPORT_SYMBOL_GPL(fixed_phy_set_link_update); static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, - struct fixed_phy_status *status, + const struct fixed_phy_status *status, struct gpio_desc *gpiod) { int ret; @@ -160,7 +160,7 @@ static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, return 0; } -int fixed_phy_add(int phy_addr, struct fixed_phy_status *status) +int fixed_phy_add(int phy_addr, const struct fixed_phy_status *status) { return fixed_phy_add_gpiod(PHY_POLL, phy_addr, status, NULL); } @@ -222,7 +222,7 @@ static struct gpio_desc *fixed_phy_get_gpiod(struct device_node *np) } #endif -struct phy_device *fixed_phy_register(struct fixed_phy_status *status, +struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 634149a73c2a..5399b9e41e35 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,8 +17,8 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -int fixed_phy_add(int phy_id, struct fixed_phy_status *status); -struct phy_device *fixed_phy_register(struct fixed_phy_status *status, +int fixed_phy_add(int phy_id, const struct fixed_phy_status *status); +struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); extern void fixed_phy_unregister(struct phy_device *phydev); @@ -27,12 +27,12 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, struct fixed_phy_status *)); #else static inline int fixed_phy_add(int phy_id, - struct fixed_phy_status *status) + const struct fixed_phy_status *status) { return -ENODEV; } static inline struct phy_device * -fixed_phy_register(struct fixed_phy_status *status, +fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From f0a56c17e64bb5e7cdb9295df2b5fc21e4949005 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:18 -0700 Subject: inet: Remove rtnl_is_held arg of lwtunnel_valid_encap_type(_attr)?(). Commit f130a0cc1b4f ("inet: fix lwtunnel_valid_encap_type() lock imbalance") added the rtnl_is_held argument as a temporary fix while I'm converting nexthop and IPv6 routing table to per-netns RTNL or RCU. Now all callers of lwtunnel_valid_encap_type() do not hold RTNL. Let's remove the argument. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/lwtunnel.h | 13 +++++-------- net/core/lwtunnel.c | 15 +++------------ net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/nexthop.c | 3 +-- net/ipv6/route.c | 6 ++---- 5 files changed, 13 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 39cd50300a18..c306ebe379a0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -116,11 +116,9 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -203,15 +201,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, } static inline int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 60f27cb4e54f..f9d76d85d04f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -149,8 +149,7 @@ int lwtunnel_build_state(struct net *net, u16 encap_type, } EXPORT_SYMBOL_GPL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, - bool rtnl_is_held) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -167,12 +166,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - if (rtnl_is_held) - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - if (rtnl_is_held) - rtnl_lock(); - ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } @@ -186,8 +180,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; @@ -208,9 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack, - rtnl_is_held) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57f088e5540e..fd1e1507a224 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -807,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), - extack, false); + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, false); + extack); if (err < 0) goto errout; break; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 823e4a783d2b..4397e89d3123 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3180,8 +3180,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->nh_encap_type, - extack, false); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 44300962230b..6baf177c529b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5172,8 +5172,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); - return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, - extack, false); + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5310,8 +5309,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, false); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } -- cgit v1.2.3 From d7500fbfb12067ee7313f13f4c58f771be3018ab Mon Sep 17 00:00:00 2001 From: Bert Karwatzki Date: Wed, 21 May 2025 00:34:29 +0200 Subject: wifi: check if socket flags are valid Checking the SOCK_WIFI_STATUS flag bit in sk_flags may give wrong results since sk_flags are part of a union and the union is used otherwise. Add sk_requests_wifi_status() which checks if sk is non-NULL, sk is a full socket (so flags are valid) and checks the flag bit. Fixes: 76a853f86c97 ("wifi: free SKBTX_WIFI_STATUS skb tx_flags flag") Suggested-by: Johannes Berg Signed-off-by: Bert Karwatzki Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250520223430.6875-1-spasswolf@web.de [edit commit message, fix indentation] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/txrx.h | 3 +-- drivers/net/wireless/marvell/mwifiex/main.c | 3 +-- include/net/sock.h | 6 ++++++ net/mac80211/mesh.c | 2 +- net/mac80211/tx.c | 6 +++--- 5 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/wil6210/txrx.h b/drivers/net/wireless/ath/wil6210/txrx.h index 33ccd0b248d4..fff8b7f8abc5 100644 --- a/drivers/net/wireless/ath/wil6210/txrx.h +++ b/drivers/net/wireless/ath/wil6210/txrx.h @@ -617,8 +617,7 @@ static inline bool wil_need_txstat(struct sk_buff *skb) { const u8 *da = wil_skb_get_da(skb); - return is_unicast_ether_addr(da) && skb->sk && - sock_flag(skb->sk, SOCK_WIFI_STATUS); + return is_unicast_ether_addr(da) && sk_requests_wifi_status(skb->sk); } static inline void wil_consume_skb(struct sk_buff *skb, bool acked) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 1485f949ad4e..7b50a88a18e5 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -913,8 +913,7 @@ mwifiex_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) multicast = is_multicast_ether_addr(skb->data); - if (unlikely(!multicast && skb->sk && - sock_flag(skb->sk, SOCK_WIFI_STATUS) && + if (unlikely(!multicast && sk_requests_wifi_status(skb->sk) && priv->adapter->fw_api_ver == MWIFIEX_FW_V15)) skb = mwifiex_clone_skb_for_tx_status(priv, skb, diff --git a/include/net/sock.h b/include/net/sock.h index f0fabb9fd28a..75c12e14fc47 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2821,6 +2821,12 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } +static inline bool +sk_requests_wifi_status(struct sock *sk) +{ + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); +} + /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a381b4b756ea..5cc56d578048 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -777,7 +777,7 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, if (ethertype < ETH_P_802_3_MIN) return false; - if (skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) + if (sk_requests_wifi_status(skb->sk)) return false; if (skb->ip_summed == CHECKSUM_PARTIAL) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3b9392a6ddb2..d8d4f3d7d7f2 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2859,7 +2859,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && - ((skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) || + (sk_requests_wifi_status(skb->sk) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); @@ -3756,7 +3756,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return false; /* don't handle TX status request here either */ - if (skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS)) + if (sk_requests_wifi_status(skb->sk)) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { @@ -4648,7 +4648,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } - if (unlikely(skb->sk && sock_flag(skb->sk, SOCK_WIFI_STATUS))) { + if (unlikely(sk_requests_wifi_status(skb->sk))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if (info->status_data) -- cgit v1.2.3 From b803c4a4f78834b31ebfbbcea350473333760559 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Fri, 2 May 2025 02:12:10 +0900 Subject: can: dev: add struct data_bittiming_params to group FD parameters This is a preparation patch for the introduction of CAN XL. CAN FD and CAN XL uses similar bittiming parameters. Add one level of nesting for all the CAN FD parameters. Typically: priv->can.data_bittiming; becomes: priv->can.fd.data_bittiming; This way, the CAN XL equivalent (to be introduced later) would be: priv->can.xl.data_bittiming; Add the new struct data_bittiming_params which contains all the data bittiming parameters, including the TDC and the callback functions. This done, update all the CAN FD drivers to make use of the new layout. Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250501171213.2161572-2-mailhol.vincent@wanadoo.fr [mkl: fix rcar_canfd] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/ctucanfd/ctucanfd_base.c | 8 +-- drivers/net/can/dev/dev.c | 12 ++-- drivers/net/can/dev/netlink.c | 74 +++++++++++----------- drivers/net/can/flexcan/flexcan-core.c | 4 +- drivers/net/can/ifi_canfd/ifi_canfd.c | 10 +-- drivers/net/can/kvaser_pciefd.c | 6 +- drivers/net/can/m_can/m_can.c | 8 +-- drivers/net/can/peak_canfd/peak_canfd.c | 6 +- drivers/net/can/rcar/rcar_canfd.c | 4 +- drivers/net/can/rockchip/rockchip_canfd-core.c | 4 +- .../net/can/rockchip/rockchip_canfd-timestamp.c | 2 +- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 10 +-- drivers/net/can/usb/esd_usb.c | 6 +- drivers/net/can/usb/etas_es58x/es58x_core.c | 4 +- drivers/net/can/usb/etas_es58x/es58x_fd.c | 6 +- drivers/net/can/usb/gs_usb.c | 8 +-- drivers/net/can/usb/kvaser_usb/kvaser_usb.h | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 6 +- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 6 +- drivers/net/can/xilinx_can.c | 16 ++--- include/linux/can/dev.h | 28 ++++---- 21 files changed, 117 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/drivers/net/can/ctucanfd/ctucanfd_base.c b/drivers/net/can/ctucanfd/ctucanfd_base.c index f65c1a1e05cc..bf6398772960 100644 --- a/drivers/net/can/ctucanfd/ctucanfd_base.c +++ b/drivers/net/can/ctucanfd/ctucanfd_base.c @@ -275,7 +275,7 @@ static int ctucan_set_bittiming(struct net_device *ndev) static int ctucan_set_data_bittiming(struct net_device *ndev) { struct ctucan_priv *priv = netdev_priv(ndev); - struct can_bittiming *dbt = &priv->can.data_bittiming; + struct can_bittiming *dbt = &priv->can.fd.data_bittiming; /* Note that dbt may be modified here */ return ctucan_set_btr(ndev, dbt, false); @@ -290,7 +290,7 @@ static int ctucan_set_data_bittiming(struct net_device *ndev) static int ctucan_set_secondary_sample_point(struct net_device *ndev) { struct ctucan_priv *priv = netdev_priv(ndev); - struct can_bittiming *dbt = &priv->can.data_bittiming; + struct can_bittiming *dbt = &priv->can.fd.data_bittiming; int ssp_offset = 0; u32 ssp_cfg = 0; /* No SSP by default */ @@ -1358,12 +1358,12 @@ int ctucan_probe_common(struct device *dev, void __iomem *addr, int irq, unsigne priv->ntxbufs = ntxbufs; priv->dev = dev; priv->can.bittiming_const = &ctu_can_fd_bit_timing_max; - priv->can.data_bittiming_const = &ctu_can_fd_bit_timing_data_max; + priv->can.fd.data_bittiming_const = &ctu_can_fd_bit_timing_data_max; priv->can.do_set_mode = ctucan_do_set_mode; /* Needed for timing adjustment to be performed as soon as possible */ priv->can.do_set_bittiming = ctucan_set_bittiming; - priv->can.do_set_data_bittiming = ctucan_set_data_bittiming; + priv->can.fd.do_set_data_bittiming = ctucan_set_data_bittiming; priv->can.do_get_berr_counter = ctucan_get_berr_counter; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 5ec3170b896a..ea8c807af4d8 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -404,8 +404,8 @@ int open_candev(struct net_device *dev) /* For CAN FD the data bitrate has to be >= the arbitration bitrate */ if ((priv->ctrlmode & CAN_CTRLMODE_FD) && - (!priv->data_bittiming.bitrate || - priv->data_bittiming.bitrate < priv->bittiming.bitrate)) { + (!priv->fd.data_bittiming.bitrate || + priv->fd.data_bittiming.bitrate < priv->bittiming.bitrate)) { netdev_err(dev, "incorrect/missing data bit-timing\n"); return -EINVAL; } @@ -543,16 +543,16 @@ int register_candev(struct net_device *dev) if (!priv->bitrate_const != !priv->bitrate_const_cnt) return -EINVAL; - if (!priv->data_bitrate_const != !priv->data_bitrate_const_cnt) + if (!priv->fd.data_bitrate_const != !priv->fd.data_bitrate_const_cnt) return -EINVAL; /* We only support either fixed bit rates or bit timing const. */ - if ((priv->bitrate_const || priv->data_bitrate_const) && - (priv->bittiming_const || priv->data_bittiming_const)) + if ((priv->bitrate_const || priv->fd.data_bitrate_const) && + (priv->bittiming_const || priv->fd.data_bittiming_const)) return -EINVAL; if (!can_bittiming_const_valid(priv->bittiming_const) || - !can_bittiming_const_valid(priv->data_bittiming_const)) + !can_bittiming_const_valid(priv->fd.data_bittiming_const)) return -EINVAL; if (!priv->termination_const) { diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index f1db9b7ffd4d..a36842ace084 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -141,7 +141,7 @@ static int can_tdc_changelink(struct can_priv *priv, const struct nlattr *nla, { struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; struct can_tdc tdc = { 0 }; - const struct can_tdc_const *tdc_const = priv->tdc_const; + const struct can_tdc_const *tdc_const = priv->fd.tdc_const; int err; if (!tdc_const || !can_tdc_is_enabled(priv)) @@ -179,7 +179,7 @@ static int can_tdc_changelink(struct can_priv *priv, const struct nlattr *nla, tdc.tdcf = tdcf; } - priv->tdc = tdc; + priv->fd.tdc = tdc; return 0; } @@ -228,10 +228,10 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], dev->mtu = CANFD_MTU; } else { dev->mtu = CAN_MTU; - memset(&priv->data_bittiming, 0, - sizeof(priv->data_bittiming)); + memset(&priv->fd.data_bittiming, 0, + sizeof(priv->fd.data_bittiming)); priv->ctrlmode &= ~CAN_CTRLMODE_TDC_MASK; - memset(&priv->tdc, 0, sizeof(priv->tdc)); + memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc)); } tdc_mask = cm->mask & CAN_CTRLMODE_TDC_MASK; @@ -312,16 +312,16 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], * directly via do_set_bitrate(). Bail out if neither * is given. */ - if (!priv->data_bittiming_const && !priv->do_set_data_bittiming && - !priv->data_bitrate_const) + if (!priv->fd.data_bittiming_const && !priv->fd.do_set_data_bittiming && + !priv->fd.data_bitrate_const) return -EOPNOTSUPP; memcpy(&dbt, nla_data(data[IFLA_CAN_DATA_BITTIMING]), sizeof(dbt)); err = can_get_bittiming(dev, &dbt, - priv->data_bittiming_const, - priv->data_bitrate_const, - priv->data_bitrate_const_cnt, + priv->fd.data_bittiming_const, + priv->fd.data_bitrate_const, + priv->fd.data_bitrate_const_cnt, extack); if (err) return err; @@ -333,7 +333,7 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } - memset(&priv->tdc, 0, sizeof(priv->tdc)); + memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc)); if (data[IFLA_CAN_TDC]) { /* TDC parameters are provided: use them */ err = can_tdc_changelink(priv, data[IFLA_CAN_TDC], @@ -346,17 +346,17 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], /* Neither of TDC parameters nor TDC flags are * provided: do calculation */ - can_calc_tdco(&priv->tdc, priv->tdc_const, &dbt, + can_calc_tdco(&priv->fd.tdc, priv->fd.tdc_const, &dbt, &priv->ctrlmode, priv->ctrlmode_supported); } /* else: both CAN_CTRLMODE_TDC_{AUTO,MANUAL} are explicitly * turned off. TDC is disabled: do nothing */ - memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); + memcpy(&priv->fd.data_bittiming, &dbt, sizeof(dbt)); - if (priv->do_set_data_bittiming) { + if (priv->fd.do_set_data_bittiming) { /* Finally, set the bit-timing registers */ - err = priv->do_set_data_bittiming(dev); + err = priv->fd.do_set_data_bittiming(dev); if (err) return err; } @@ -394,7 +394,7 @@ static size_t can_tdc_get_size(const struct net_device *dev) struct can_priv *priv = netdev_priv(dev); size_t size; - if (!priv->tdc_const) + if (!priv->fd.tdc_const) return 0; size = nla_total_size(0); /* nest IFLA_CAN_TDC */ @@ -404,17 +404,17 @@ static size_t can_tdc_get_size(const struct net_device *dev) } size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO_MIN */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO_MAX */ - if (priv->tdc_const->tdcf_max) { + if (priv->fd.tdc_const->tdcf_max) { size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF_MIN */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF_MAX */ } if (can_tdc_is_enabled(priv)) { if (priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL || - priv->do_get_auto_tdcv) + priv->fd.do_get_auto_tdcv) size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCV */ size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCO */ - if (priv->tdc_const->tdcf_max) + if (priv->fd.tdc_const->tdcf_max) size += nla_total_size(sizeof(u32)); /* IFLA_CAN_TDCF */ } @@ -442,9 +442,9 @@ static size_t can_get_size(const struct net_device *dev) size += nla_total_size(sizeof(u32)); /* IFLA_CAN_RESTART_MS */ if (priv->do_get_berr_counter) /* IFLA_CAN_BERR_COUNTER */ size += nla_total_size(sizeof(struct can_berr_counter)); - if (priv->data_bittiming.bitrate) /* IFLA_CAN_DATA_BITTIMING */ + if (priv->fd.data_bittiming.bitrate) /* IFLA_CAN_DATA_BITTIMING */ size += nla_total_size(sizeof(struct can_bittiming)); - if (priv->data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ + if (priv->fd.data_bittiming_const) /* IFLA_CAN_DATA_BITTIMING_CONST */ size += nla_total_size(sizeof(struct can_bittiming_const)); if (priv->termination_const) { size += nla_total_size(sizeof(priv->termination)); /* IFLA_CAN_TERMINATION */ @@ -454,9 +454,9 @@ static size_t can_get_size(const struct net_device *dev) if (priv->bitrate_const) /* IFLA_CAN_BITRATE_CONST */ size += nla_total_size(sizeof(*priv->bitrate_const) * priv->bitrate_const_cnt); - if (priv->data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ - size += nla_total_size(sizeof(*priv->data_bitrate_const) * - priv->data_bitrate_const_cnt); + if (priv->fd.data_bitrate_const) /* IFLA_CAN_DATA_BITRATE_CONST */ + size += nla_total_size(sizeof(*priv->fd.data_bitrate_const) * + priv->fd.data_bitrate_const_cnt); size += sizeof(priv->bitrate_max); /* IFLA_CAN_BITRATE_MAX */ size += can_tdc_get_size(dev); /* IFLA_CAN_TDC */ size += can_ctrlmode_ext_get_size(); /* IFLA_CAN_CTRLMODE_EXT */ @@ -468,8 +468,8 @@ static int can_tdc_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *nest; struct can_priv *priv = netdev_priv(dev); - struct can_tdc *tdc = &priv->tdc; - const struct can_tdc_const *tdc_const = priv->tdc_const; + struct can_tdc *tdc = &priv->fd.tdc; + const struct can_tdc_const *tdc_const = priv->fd.tdc_const; if (!tdc_const) return 0; @@ -497,8 +497,8 @@ static int can_tdc_fill_info(struct sk_buff *skb, const struct net_device *dev) if (priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL) { tdcv = tdc->tdcv; err = 0; - } else if (priv->do_get_auto_tdcv) { - err = priv->do_get_auto_tdcv(dev, &tdcv); + } else if (priv->fd.do_get_auto_tdcv) { + err = priv->fd.do_get_auto_tdcv(dev, &tdcv); } if (!err && nla_put_u32(skb, IFLA_CAN_TDC_TDCV, tdcv)) goto err_cancel; @@ -564,14 +564,14 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) !priv->do_get_berr_counter(dev, &bec) && nla_put(skb, IFLA_CAN_BERR_COUNTER, sizeof(bec), &bec)) || - (priv->data_bittiming.bitrate && + (priv->fd.data_bittiming.bitrate && nla_put(skb, IFLA_CAN_DATA_BITTIMING, - sizeof(priv->data_bittiming), &priv->data_bittiming)) || + sizeof(priv->fd.data_bittiming), &priv->fd.data_bittiming)) || - (priv->data_bittiming_const && + (priv->fd.data_bittiming_const && nla_put(skb, IFLA_CAN_DATA_BITTIMING_CONST, - sizeof(*priv->data_bittiming_const), - priv->data_bittiming_const)) || + sizeof(*priv->fd.data_bittiming_const), + priv->fd.data_bittiming_const)) || (priv->termination_const && (nla_put_u16(skb, IFLA_CAN_TERMINATION, priv->termination) || @@ -586,11 +586,11 @@ static int can_fill_info(struct sk_buff *skb, const struct net_device *dev) priv->bitrate_const_cnt, priv->bitrate_const)) || - (priv->data_bitrate_const && + (priv->fd.data_bitrate_const && nla_put(skb, IFLA_CAN_DATA_BITRATE_CONST, - sizeof(*priv->data_bitrate_const) * - priv->data_bitrate_const_cnt, - priv->data_bitrate_const)) || + sizeof(*priv->fd.data_bitrate_const) * + priv->fd.data_bitrate_const_cnt, + priv->fd.data_bitrate_const)) || (nla_put(skb, IFLA_CAN_BITRATE_MAX, sizeof(priv->bitrate_max), diff --git a/drivers/net/can/flexcan/flexcan-core.c b/drivers/net/can/flexcan/flexcan-core.c index 6d80c341b26f..06d5d35fc1b5 100644 --- a/drivers/net/can/flexcan/flexcan-core.c +++ b/drivers/net/can/flexcan/flexcan-core.c @@ -1226,7 +1226,7 @@ static void flexcan_set_bittiming_cbt(const struct net_device *dev) { struct flexcan_priv *priv = netdev_priv(dev); struct can_bittiming *bt = &priv->can.bittiming; - struct can_bittiming *dbt = &priv->can.data_bittiming; + struct can_bittiming *dbt = &priv->can.fd.data_bittiming; struct flexcan_regs __iomem *regs = priv->regs; u32 reg_cbt, reg_fdctrl; @@ -2239,7 +2239,7 @@ static int flexcan_probe(struct platform_device *pdev) priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD | CAN_CTRLMODE_FD_NON_ISO; priv->can.bittiming_const = &flexcan_fd_bittiming_const; - priv->can.data_bittiming_const = + priv->can.fd.data_bittiming_const = &flexcan_fd_data_bittiming_const; } else { priv->can.bittiming_const = &flexcan_bittiming_const; diff --git a/drivers/net/can/ifi_canfd/ifi_canfd.c b/drivers/net/can/ifi_canfd/ifi_canfd.c index c86b57d47085..2eeee65f606f 100644 --- a/drivers/net/can/ifi_canfd/ifi_canfd.c +++ b/drivers/net/can/ifi_canfd/ifi_canfd.c @@ -669,7 +669,7 @@ static void ifi_canfd_set_bittiming(struct net_device *ndev) { struct ifi_canfd_priv *priv = netdev_priv(ndev); const struct can_bittiming *bt = &priv->can.bittiming; - const struct can_bittiming *dbt = &priv->can.data_bittiming; + const struct can_bittiming *dbt = &priv->can.fd.data_bittiming; u16 brp, sjw, tseg1, tseg2, tdc; /* Configure bit timing */ @@ -1000,10 +1000,10 @@ static int ifi_canfd_plat_probe(struct platform_device *pdev) priv->can.clock.freq = readl(addr + IFI_CANFD_CANCLOCK); - priv->can.bittiming_const = &ifi_canfd_bittiming_const; - priv->can.data_bittiming_const = &ifi_canfd_bittiming_const; - priv->can.do_set_mode = ifi_canfd_set_mode; - priv->can.do_get_berr_counter = ifi_canfd_get_berr_counter; + priv->can.bittiming_const = &ifi_canfd_bittiming_const; + priv->can.fd.data_bittiming_const = &ifi_canfd_bittiming_const; + priv->can.do_set_mode = ifi_canfd_set_mode; + priv->can.do_get_berr_counter = ifi_canfd_get_berr_counter; /* IFI CANFD can do both Bosch FD and ISO FD */ priv->can.ctrlmode = CAN_CTRLMODE_FD; diff --git a/drivers/net/can/kvaser_pciefd.c b/drivers/net/can/kvaser_pciefd.c index cf0d51805272..4022eb2e1a7d 100644 --- a/drivers/net/can/kvaser_pciefd.c +++ b/drivers/net/can/kvaser_pciefd.c @@ -856,7 +856,7 @@ static int kvaser_pciefd_set_bittiming(struct kvaser_pciefd_can *can, bool data) struct can_bittiming *bt; if (data) - bt = &can->can.data_bittiming; + bt = &can->can.fd.data_bittiming; else bt = &can->can.bittiming; @@ -991,9 +991,9 @@ static int kvaser_pciefd_setup_can_ctrls(struct kvaser_pciefd *pcie) spin_lock_init(&can->lock); can->can.bittiming_const = &kvaser_pciefd_bittiming_const; - can->can.data_bittiming_const = &kvaser_pciefd_bittiming_const; + can->can.fd.data_bittiming_const = &kvaser_pciefd_bittiming_const; can->can.do_set_bittiming = kvaser_pciefd_set_nominal_bittiming; - can->can.do_set_data_bittiming = kvaser_pciefd_set_data_bittiming; + can->can.fd.do_set_data_bittiming = kvaser_pciefd_set_data_bittiming; can->can.do_set_mode = kvaser_pciefd_set_mode; can->can.do_get_berr_counter = kvaser_pciefd_get_berr_counter; can->can.ctrlmode_supported = CAN_CTRLMODE_LISTENONLY | diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index c2c116ce1087..6c656bfdb323 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1372,7 +1372,7 @@ static int m_can_set_bittiming(struct net_device *dev) { struct m_can_classdev *cdev = netdev_priv(dev); const struct can_bittiming *bt = &cdev->can.bittiming; - const struct can_bittiming *dbt = &cdev->can.data_bittiming; + const struct can_bittiming *dbt = &cdev->can.fd.data_bittiming; u16 brp, sjw, tseg1, tseg2; u32 reg_btp; @@ -1738,7 +1738,7 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) if (err) return err; cdev->can.bittiming_const = &m_can_bittiming_const_30X; - cdev->can.data_bittiming_const = &m_can_data_bittiming_const_30X; + cdev->can.fd.data_bittiming_const = &m_can_data_bittiming_const_30X; break; case 31: /* CAN_CTRLMODE_FD_NON_ISO is fixed with M_CAN IP v3.1.x */ @@ -1746,13 +1746,13 @@ static int m_can_dev_setup(struct m_can_classdev *cdev) if (err) return err; cdev->can.bittiming_const = &m_can_bittiming_const_31X; - cdev->can.data_bittiming_const = &m_can_data_bittiming_const_31X; + cdev->can.fd.data_bittiming_const = &m_can_data_bittiming_const_31X; break; case 32: case 33: /* Support both MCAN version v3.2.x and v3.3.0 */ cdev->can.bittiming_const = &m_can_bittiming_const_31X; - cdev->can.data_bittiming_const = &m_can_data_bittiming_const_31X; + cdev->can.fd.data_bittiming_const = &m_can_data_bittiming_const_31X; niso = m_can_niso_supported(cdev); if (niso < 0) diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 28f3fd805273..77292afaed22 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -624,7 +624,7 @@ static int peak_canfd_set_data_bittiming(struct net_device *ndev) { struct peak_canfd_priv *priv = netdev_priv(ndev); - return pucan_set_timing_fast(priv, &priv->can.data_bittiming); + return pucan_set_timing_fast(priv, &priv->can.fd.data_bittiming); } static int peak_canfd_close(struct net_device *ndev) @@ -813,12 +813,12 @@ struct net_device *alloc_peak_canfd_dev(int sizeof_priv, int index, /* complete now socket-can initialization side */ priv->can.state = CAN_STATE_STOPPED; priv->can.bittiming_const = &peak_canfd_nominal_const; - priv->can.data_bittiming_const = &peak_canfd_data_const; + priv->can.fd.data_bittiming_const = &peak_canfd_data_const; priv->can.do_set_mode = peak_canfd_set_mode; priv->can.do_get_berr_counter = peak_canfd_get_berr_counter; priv->can.do_set_bittiming = peak_canfd_set_bittiming; - priv->can.do_set_data_bittiming = peak_canfd_set_data_bittiming; + priv->can.fd.do_set_data_bittiming = peak_canfd_set_data_bittiming; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_3_SAMPLES | diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 27d503ac87dc..7f10213738e5 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -1441,7 +1441,7 @@ static void rcar_canfd_set_bittiming(struct net_device *dev) struct rcar_canfd_channel *priv = netdev_priv(dev); struct rcar_canfd_global *gpriv = priv->gpriv; const struct can_bittiming *bt = &priv->can.bittiming; - const struct can_bittiming *dbt = &priv->can.data_bittiming; + const struct can_bittiming *dbt = &priv->can.fd.data_bittiming; u16 brp, sjw, tseg1, tseg2; u32 cfg; u32 ch = priv->channel; @@ -1923,7 +1923,7 @@ static int rcar_canfd_channel_probe(struct rcar_canfd_global *gpriv, u32 ch, if (gpriv->fdmode) { priv->can.bittiming_const = gpriv->info->nom_bittiming; - priv->can.data_bittiming_const = gpriv->info->data_bittiming; + priv->can.fd.data_bittiming_const = gpriv->info->data_bittiming; /* Controller starts in CAN FD only mode */ err = can_set_static_ctrlmode(ndev, CAN_CTRLMODE_FD); diff --git a/drivers/net/can/rockchip/rockchip_canfd-core.c b/drivers/net/can/rockchip/rockchip_canfd-core.c index c3fb3176ce42..046f0a0ae4d4 100644 --- a/drivers/net/can/rockchip/rockchip_canfd-core.c +++ b/drivers/net/can/rockchip/rockchip_canfd-core.c @@ -118,7 +118,7 @@ static void rkcanfd_chip_set_work_mode(const struct rkcanfd_priv *priv) static int rkcanfd_set_bittiming(struct rkcanfd_priv *priv) { - const struct can_bittiming *dbt = &priv->can.data_bittiming; + const struct can_bittiming *dbt = &priv->can.fd.data_bittiming; const struct can_bittiming *bt = &priv->can.bittiming; u32 reg_nbt, reg_dbt, reg_tdc; u32 tdco; @@ -899,7 +899,7 @@ static int rkcanfd_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); priv->can.clock.freq = clk_get_rate(priv->clks[0].clk); priv->can.bittiming_const = &rkcanfd_bittiming_const; - priv->can.data_bittiming_const = &rkcanfd_data_bittiming_const; + priv->can.fd.data_bittiming_const = &rkcanfd_data_bittiming_const; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_BERR_REPORTING; priv->can.do_set_mode = rkcanfd_set_mode; diff --git a/drivers/net/can/rockchip/rockchip_canfd-timestamp.c b/drivers/net/can/rockchip/rockchip_canfd-timestamp.c index 43d4b5721812..fa85a75be65a 100644 --- a/drivers/net/can/rockchip/rockchip_canfd-timestamp.c +++ b/drivers/net/can/rockchip/rockchip_canfd-timestamp.c @@ -39,7 +39,7 @@ static void rkcanfd_timestamp_work(struct work_struct *work) void rkcanfd_timestamp_init(struct rkcanfd_priv *priv) { - const struct can_bittiming *dbt = &priv->can.data_bittiming; + const struct can_bittiming *dbt = &priv->can.fd.data_bittiming; const struct can_bittiming *bt = &priv->can.bittiming; struct cyclecounter *cc = &priv->cc; u32 bitrate, div, reg, rate; diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index c30b04f8fc0d..7450ea42c1ea 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -527,7 +527,7 @@ static int mcp251xfd_chip_timestamp_init(const struct mcp251xfd_priv *priv) static int mcp251xfd_set_bittiming(const struct mcp251xfd_priv *priv) { const struct can_bittiming *bt = &priv->can.bittiming; - const struct can_bittiming *dbt = &priv->can.data_bittiming; + const struct can_bittiming *dbt = &priv->can.fd.data_bittiming; u32 tdcmod, val = 0; int err; @@ -600,8 +600,8 @@ static int mcp251xfd_set_bittiming(const struct mcp251xfd_priv *priv) tdcmod = MCP251XFD_REG_TDC_TDCMOD_DISABLED; val = FIELD_PREP(MCP251XFD_REG_TDC_TDCMOD_MASK, tdcmod) | - FIELD_PREP(MCP251XFD_REG_TDC_TDCV_MASK, priv->can.tdc.tdcv) | - FIELD_PREP(MCP251XFD_REG_TDC_TDCO_MASK, priv->can.tdc.tdco); + FIELD_PREP(MCP251XFD_REG_TDC_TDCV_MASK, priv->can.fd.tdc.tdcv) | + FIELD_PREP(MCP251XFD_REG_TDC_TDCO_MASK, priv->can.fd.tdc.tdco); return regmap_write(priv->map_reg, MCP251XFD_REG_TDC, val); } @@ -2104,8 +2104,8 @@ static int mcp251xfd_probe(struct spi_device *spi) priv->can.do_set_mode = mcp251xfd_set_mode; priv->can.do_get_berr_counter = mcp251xfd_get_berr_counter; priv->can.bittiming_const = &mcp251xfd_bittiming_const; - priv->can.data_bittiming_const = &mcp251xfd_data_bittiming_const; - priv->can.tdc_const = &mcp251xfd_tdc_const; + priv->can.fd.data_bittiming_const = &mcp251xfd_data_bittiming_const; + priv->can.fd.tdc_const = &mcp251xfd_tdc_const; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_BERR_REPORTING | CAN_CTRLMODE_FD | CAN_CTRLMODE_FD_NON_ISO | diff --git a/drivers/net/can/usb/esd_usb.c b/drivers/net/can/usb/esd_usb.c index 03ad10b01867..27a3818885c2 100644 --- a/drivers/net/can/usb/esd_usb.c +++ b/drivers/net/can/usb/esd_usb.c @@ -1098,7 +1098,7 @@ static int esd_usb_3_set_bittiming(struct net_device *netdev) const struct can_bittiming_const *data_btc = &esd_usb_3_data_bittiming_const; struct esd_usb_net_priv *priv = netdev_priv(netdev); struct can_bittiming *nom_bt = &priv->can.bittiming; - struct can_bittiming *data_bt = &priv->can.data_bittiming; + struct can_bittiming *data_bt = &priv->can.fd.data_bittiming; struct esd_usb_3_set_baudrate_msg_x *baud_x; union esd_usb_msg *msg; u16 flags = 0; @@ -1218,9 +1218,9 @@ static int esd_usb_probe_one_net(struct usb_interface *intf, int index) priv->can.clock.freq = ESD_USB_3_CAN_CLOCK; priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD; priv->can.bittiming_const = &esd_usb_3_nom_bittiming_const; - priv->can.data_bittiming_const = &esd_usb_3_data_bittiming_const; + priv->can.fd.data_bittiming_const = &esd_usb_3_data_bittiming_const; priv->can.do_set_bittiming = esd_usb_3_set_bittiming; - priv->can.do_set_data_bittiming = esd_usb_3_set_bittiming; + priv->can.fd.do_set_data_bittiming = esd_usb_3_set_bittiming; break; case ESD_USB_CANUSBM_PRODUCT_ID: diff --git a/drivers/net/can/usb/etas_es58x/es58x_core.c b/drivers/net/can/usb/etas_es58x/es58x_core.c index 71f24dc0a927..db1acf6d504c 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_core.c +++ b/drivers/net/can/usb/etas_es58x/es58x_core.c @@ -2059,8 +2059,8 @@ static int es58x_init_priv(struct es58x_device *es58x_dev, can->bittiming_const = param->bittiming_const; if (param->ctrlmode_supported & CAN_CTRLMODE_FD) { - can->data_bittiming_const = param->data_bittiming_const; - can->tdc_const = param->tdc_const; + can->fd.data_bittiming_const = param->data_bittiming_const; + can->fd.tdc_const = param->tdc_const; } can->bitrate_max = param->bitrate_max; can->clock = param->clock; diff --git a/drivers/net/can/usb/etas_es58x/es58x_fd.c b/drivers/net/can/usb/etas_es58x/es58x_fd.c index 84ffa1839bac..d924b053677b 100644 --- a/drivers/net/can/usb/etas_es58x/es58x_fd.c +++ b/drivers/net/can/usb/etas_es58x/es58x_fd.c @@ -427,12 +427,12 @@ static int es58x_fd_enable_channel(struct es58x_priv *priv) if (tx_conf_msg.canfd_enabled) { es58x_fd_convert_bittiming(&tx_conf_msg.data_bittiming, - &priv->can.data_bittiming); + &priv->can.fd.data_bittiming); if (can_tdc_is_enabled(&priv->can)) { tx_conf_msg.tdc_enabled = 1; - tx_conf_msg.tdco = cpu_to_le16(priv->can.tdc.tdco); - tx_conf_msg.tdcf = cpu_to_le16(priv->can.tdc.tdcf); + tx_conf_msg.tdco = cpu_to_le16(priv->can.fd.tdc.tdco); + tx_conf_msg.tdcf = cpu_to_le16(priv->can.fd.tdc.tdcf); } conf_len = ES58X_FD_CANFD_CONF_LEN; diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 3ccac6781b98..bb6335278e46 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -728,7 +728,7 @@ static int gs_usb_set_bittiming(struct net_device *netdev) static int gs_usb_set_data_bittiming(struct net_device *netdev) { struct gs_can *dev = netdev_priv(netdev); - struct can_bittiming *bt = &dev->can.data_bittiming; + struct can_bittiming *bt = &dev->can.fd.data_bittiming; struct gs_device_bittiming dbt = { .prop_seg = cpu_to_le32(bt->prop_seg), .phase_seg1 = cpu_to_le32(bt->phase_seg1), @@ -1300,8 +1300,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, /* The data bit timing will be overwritten, if * GS_CAN_FEATURE_BT_CONST_EXT is set. */ - dev->can.data_bittiming_const = &dev->bt_const; - dev->can.do_set_data_bittiming = gs_usb_set_data_bittiming; + dev->can.fd.data_bittiming_const = &dev->bt_const; + dev->can.fd.do_set_data_bittiming = gs_usb_set_data_bittiming; } if (feature & GS_CAN_FEATURE_TERMINATION) { @@ -1381,7 +1381,7 @@ static struct gs_can *gs_make_candev(unsigned int channel, dev->data_bt_const.brp_max = le32_to_cpu(bt_const_extended.dbrp_max); dev->data_bt_const.brp_inc = le32_to_cpu(bt_const_extended.dbrp_inc); - dev->can.data_bittiming_const = &dev->data_bt_const; + dev->can.fd.data_bittiming_const = &dev->data_bt_const; } can_rx_offload_add_manual(netdev, &dev->offload, GS_NAPI_WEIGHT); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb.h b/drivers/net/can/usb/kvaser_usb/kvaser_usb.h index 078496d9b7ba..f6c77eca9f43 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb.h +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb.h @@ -137,7 +137,7 @@ struct kvaser_usb_net_priv { * @dev_set_mode: used for can.do_set_mode * @dev_set_bittiming: used for can.do_set_bittiming * @dev_get_busparams: readback arbitration busparams - * @dev_set_data_bittiming: used for can.do_set_data_bittiming + * @dev_set_data_bittiming: used for can.fd.do_set_data_bittiming * @dev_get_data_busparams: readback data busparams * @dev_get_berr_counter: used for can.do_get_berr_counter * diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index dcb0bcbe0565..daf42080f942 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -592,7 +592,7 @@ static int kvaser_usb_set_data_bittiming(struct net_device *netdev) struct kvaser_usb_net_priv *priv = netdev_priv(netdev); struct kvaser_usb *dev = priv->dev; const struct kvaser_usb_dev_ops *ops = dev->driver_info->ops; - struct can_bittiming *dbt = &priv->can.data_bittiming; + struct can_bittiming *dbt = &priv->can.fd.data_bittiming; struct kvaser_usb_busparams busparams; int tseg1 = dbt->prop_seg + dbt->phase_seg1; int tseg2 = dbt->phase_seg2; @@ -842,8 +842,8 @@ static int kvaser_usb_init_one(struct kvaser_usb *dev, int channel) priv->can.ctrlmode_supported |= dev->card_data.ctrlmode_supported; if (priv->can.ctrlmode_supported & CAN_CTRLMODE_FD) { - priv->can.data_bittiming_const = dev->cfg->data_bittiming_const; - priv->can.do_set_data_bittiming = kvaser_usb_set_data_bittiming; + priv->can.fd.data_bittiming_const = dev->cfg->data_bittiming_const; + priv->can.fd.do_set_data_bittiming = kvaser_usb_set_data_bittiming; } netdev->flags |= IFF_ECHO; diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 59f7cd8ceb39..117637b9b995 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -770,7 +770,7 @@ static int peak_usb_set_data_bittiming(struct net_device *netdev) const struct peak_usb_adapter *pa = dev->adapter; if (pa->dev_set_data_bittiming) { - struct can_bittiming *bt = &dev->can.data_bittiming; + struct can_bittiming *bt = &dev->can.fd.data_bittiming; int err = pa->dev_set_data_bittiming(dev, bt); if (err) @@ -954,8 +954,8 @@ static int peak_usb_create_dev(const struct peak_usb_adapter *peak_usb_adapter, dev->can.clock = peak_usb_adapter->clock; dev->can.bittiming_const = peak_usb_adapter->bittiming_const; dev->can.do_set_bittiming = peak_usb_set_bittiming; - dev->can.data_bittiming_const = peak_usb_adapter->data_bittiming_const; - dev->can.do_set_data_bittiming = peak_usb_set_data_bittiming; + dev->can.fd.data_bittiming_const = peak_usb_adapter->data_bittiming_const; + dev->can.fd.do_set_data_bittiming = peak_usb_set_data_bittiming; dev->can.do_set_mode = peak_usb_set_mode; dev->can.do_get_berr_counter = peak_usb_adapter->do_get_berr_counter; dev->can.ctrlmode_supported = peak_usb_adapter->ctrlmode_supported; diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c index 436c0e4b0344..3f2e378199ab 100644 --- a/drivers/net/can/xilinx_can.c +++ b/drivers/net/can/xilinx_can.c @@ -481,7 +481,7 @@ static int xcan_set_bittiming(struct net_device *ndev) { struct xcan_priv *priv = netdev_priv(ndev); struct can_bittiming *bt = &priv->can.bittiming; - struct can_bittiming *dbt = &priv->can.data_bittiming; + struct can_bittiming *dbt = &priv->can.fd.data_bittiming; u32 btr0, btr1; u32 is_config_mode; @@ -517,10 +517,10 @@ static int xcan_set_bittiming(struct net_device *ndev) btr0 = dbt->brp - 1; if (can_tdc_is_enabled(&priv->can)) { if (priv->devtype.cantype == XAXI_CANFD) - btr0 |= FIELD_PREP(XCAN_BRPR_TDCO_MASK, priv->can.tdc.tdco) | + btr0 |= FIELD_PREP(XCAN_BRPR_TDCO_MASK, priv->can.fd.tdc.tdco) | XCAN_BRPR_TDC_ENABLE; else - btr0 |= FIELD_PREP(XCAN_2_BRPR_TDCO_MASK, priv->can.tdc.tdco) | + btr0 |= FIELD_PREP(XCAN_2_BRPR_TDCO_MASK, priv->can.fd.tdc.tdco) | XCAN_BRPR_TDC_ENABLE; } @@ -1967,22 +1967,22 @@ static int xcan_probe(struct platform_device *pdev) goto err_free; if (devtype->cantype == XAXI_CANFD) { - priv->can.data_bittiming_const = + priv->can.fd.data_bittiming_const = &xcan_data_bittiming_const_canfd; - priv->can.tdc_const = &xcan_tdc_const_canfd; + priv->can.fd.tdc_const = &xcan_tdc_const_canfd; } if (devtype->cantype == XAXI_CANFD_2_0) { - priv->can.data_bittiming_const = + priv->can.fd.data_bittiming_const = &xcan_data_bittiming_const_canfd2; - priv->can.tdc_const = &xcan_tdc_const_canfd2; + priv->can.fd.tdc_const = &xcan_tdc_const_canfd2; } if (devtype->cantype == XAXI_CANFD || devtype->cantype == XAXI_CANFD_2_0) { priv->can.ctrlmode_supported |= CAN_CTRLMODE_FD | CAN_CTRLMODE_TDC_AUTO; - priv->can.do_get_auto_tdcv = xcan_get_auto_tdcv; + priv->can.fd.do_get_auto_tdcv = xcan_get_auto_tdcv; } priv->reg_base = addr; diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 23492213ea35..492d23bec7be 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,6 +38,17 @@ enum can_termination_gpio { CAN_TERMINATION_GPIO_MAX, }; +struct data_bittiming_params { + const struct can_bittiming_const *data_bittiming_const; + struct can_bittiming data_bittiming; + const struct can_tdc_const *tdc_const; + struct can_tdc tdc; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; + int (*do_set_data_bittiming)(struct net_device *dev); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); +}; + /* * CAN common private data */ @@ -45,16 +56,11 @@ struct can_priv { struct net_device *dev; struct can_device_stats can_stats; - const struct can_bittiming_const *bittiming_const, - *data_bittiming_const; - struct can_bittiming bittiming, data_bittiming; - const struct can_tdc_const *tdc_const; - struct can_tdc tdc; - + const struct can_bittiming_const *bittiming_const; + struct can_bittiming bittiming; + struct data_bittiming_params fd; unsigned int bitrate_const_cnt; const u32 *bitrate_const; - const u32 *data_bitrate_const; - unsigned int data_bitrate_const_cnt; u32 bitrate_max; struct can_clock clock; @@ -77,14 +83,12 @@ struct can_priv { struct delayed_work restart_work; int (*do_set_bittiming)(struct net_device *dev); - int (*do_set_data_bittiming)(struct net_device *dev); int (*do_set_mode)(struct net_device *dev, enum can_mode mode); int (*do_set_termination)(struct net_device *dev, u16 term); int (*do_get_state)(const struct net_device *dev, enum can_state *state); int (*do_get_berr_counter)(const struct net_device *dev, struct can_berr_counter *bec); - int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); }; static inline bool can_tdc_is_enabled(const struct can_priv *priv) @@ -114,11 +118,11 @@ static inline bool can_tdc_is_enabled(const struct can_priv *priv) */ static inline s32 can_get_relative_tdco(const struct can_priv *priv) { - const struct can_bittiming *dbt = &priv->data_bittiming; + const struct can_bittiming *dbt = &priv->fd.data_bittiming; s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + dbt->phase_seg1) * dbt->brp; - return (s32)priv->tdc.tdco - sample_point_in_tc; + return (s32)priv->fd.tdc.tdco - sample_point_in_tc; } /* helper to define static CAN controller features at device creation time */ -- cgit v1.2.3 From 04425292a62c15d1fde714522beaf8f3c2ed1de9 Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Wed, 16 Apr 2025 09:53:35 +0000 Subject: Bluetooth: Introduce HCI Driver protocol Although commit 75ddcd5ad40e ("Bluetooth: btusb: Configure altsetting for HCI_USER_CHANNEL") has enabled the HCI_USER_CHANNEL user to send out SCO data through USB Bluetooth chips, it's observed that with the patch HFP is flaky on most of the existing USB Bluetooth controllers: Intel chips sometimes send out no packet for Transparent codec; MTK chips may generate SCO data with a wrong handle for CVSD codec; RTK could split the data with a wrong packet size for Transparent codec; ... etc. To address the issue above one needs to reset the altsetting back to zero when there is no active SCO connection, which is the same as the BlueZ behavior, and another benefit is the bus doesn't need to reserve bandwidth when no SCO connection. This patch adds the infrastructure that allow the user space program to talk to Bluetooth drivers directly: - Define the new packet type HCI_DRV_PKT which is specifically used for communication between the user space program and the Bluetooth drviers - hci_send_frame intercepts the packets and invokes drivers' HCI Drv callbacks (so far only defined for btusb) - 2 kinds of events to user space: Command Status and Command Complete, the former simply returns the status while the later may contain additional response data. Cc: chromeos-bluetooth-upstreaming@chromium.org Fixes: b16b327edb4d ("Bluetooth: btusb: add sysfs attribute to control USB alt setting") Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 69 ++++++++++++++++++++++--- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 3 ++ include/net/bluetooth/hci_drv.h | 76 ++++++++++++++++++++++++++++ include/net/bluetooth/hci_mon.h | 2 + net/bluetooth/Makefile | 3 +- net/bluetooth/hci_core.c | 11 ++++ net/bluetooth/hci_drv.c | 105 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/hci_sock.c | 12 ++++- 9 files changed, 273 insertions(+), 9 deletions(-) create mode 100644 include/net/bluetooth/hci_drv.h create mode 100644 net/bluetooth/hci_drv.c (limited to 'include') diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 47e08fa7db8f..953b88449923 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -21,6 +21,7 @@ #include #include +#include #include "btintel.h" #include "btbcm.h" @@ -3800,6 +3801,61 @@ static ssize_t isoc_alt_store(struct device *dev, static DEVICE_ATTR_RW(isoc_alt); +static const struct { + u16 opcode; + const char *desc; +} btusb_hci_drv_supported_commands[] = { + /* Common commands */ + { HCI_DRV_OP_READ_INFO, "Read Info" }, +}; +static int btusb_hci_drv_read_info(struct hci_dev *hdev, void *data, + u16 data_len) +{ + struct hci_drv_rp_read_info *rp; + size_t rp_size; + int err, i; + u16 opcode, num_supported_commands = + ARRAY_SIZE(btusb_hci_drv_supported_commands); + + rp_size = sizeof(*rp) + num_supported_commands * 2; + + rp = kmalloc(rp_size, GFP_KERNEL); + if (!rp) + return -ENOMEM; + + strscpy_pad(rp->driver_name, btusb_driver.name); + + rp->num_supported_commands = cpu_to_le16(num_supported_commands); + for (i = 0; i < num_supported_commands; i++) { + opcode = btusb_hci_drv_supported_commands[i].opcode; + bt_dev_info(hdev, + "Supported HCI Drv command (0x%02x|0x%04x): %s", + hci_opcode_ogf(opcode), + hci_opcode_ocf(opcode), + btusb_hci_drv_supported_commands[i].desc); + rp->supported_commands[i] = cpu_to_le16(opcode); + } + + err = hci_drv_cmd_complete(hdev, HCI_DRV_OP_READ_INFO, + HCI_DRV_STATUS_SUCCESS, rp, rp_size); + + kfree(rp); + return err; +} + +static const struct hci_drv_handler btusb_hci_drv_common_handlers[] = { + { btusb_hci_drv_read_info, HCI_DRV_READ_INFO_SIZE }, +}; + +static const struct hci_drv_handler btusb_hci_drv_specific_handlers[] = {}; + +static struct hci_drv btusb_hci_drv = { + .common_handler_count = ARRAY_SIZE(btusb_hci_drv_common_handlers), + .common_handlers = btusb_hci_drv_common_handlers, + .specific_handler_count = ARRAY_SIZE(btusb_hci_drv_specific_handlers), + .specific_handlers = btusb_hci_drv_specific_handlers, +}; + static int btusb_probe(struct usb_interface *intf, const struct usb_device_id *id) { @@ -3939,12 +3995,13 @@ static int btusb_probe(struct usb_interface *intf, data->reset_gpio = reset_gpio; } - hdev->open = btusb_open; - hdev->close = btusb_close; - hdev->flush = btusb_flush; - hdev->send = btusb_send_frame; - hdev->notify = btusb_notify; - hdev->wakeup = btusb_wakeup; + hdev->open = btusb_open; + hdev->close = btusb_close; + hdev->flush = btusb_flush; + hdev->send = btusb_send_frame; + hdev->notify = btusb_notify; + hdev->wakeup = btusb_wakeup; + hdev->hci_drv = &btusb_hci_drv; #ifdef CONFIG_PM err = btusb_config_oob_wake(hdev); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 797992019f9e..2502febf4da7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -494,6 +494,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 54bfeeaa0995..57f6175fd1cd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -613,6 +614,8 @@ struct hci_dev { struct list_head monitored_devices; bool advmon_pend_notify; + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index 000000000000..2f01c44f05ec --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include + +#include +#include + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[]; +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. + */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 082f89531b88..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,6 +51,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 5a3835b7dfcd..a7eede7616d8 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o + ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o \ + hci_drv.o bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5eb0600bbd03..df7d7bf32eb8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2911,6 +2911,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ISODATA_PKT: break; + case HCI_DRV_PKT: + break; default: kfree_skb(skb); return -EINVAL; @@ -3019,6 +3021,15 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return -EINVAL; } + if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { + /* Intercept HCI Drv packet here and don't go with hdev->send + * callback. + */ + err = hci_drv_process_cmd(hdev, skb); + kfree_skb(skb); + return err; + } + err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c new file mode 100644 index 000000000000..3dd2d8a006b9 --- /dev/null +++ b/net/bluetooth/hci_drv.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google Corporation + */ + +#include +#include + +#include +#include +#include +#include + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_status *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS); + hdr->len = __cpu_to_le16(sizeof(*ev)); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_status); + +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_complete *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE); + hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + skb_put_data(skb, rp, rp_len); + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_complete); + +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_drv_cmd_hdr *hdr; + const struct hci_drv_handler *handler = NULL; + u16 opcode, len, ogf, ocf; + + hdr = skb_pull_data(skb, sizeof(*hdr)); + if (!hdr) + return -EILSEQ; + + opcode = __le16_to_cpu(hdr->opcode); + len = __le16_to_cpu(hdr->len); + if (len != skb->len) + return -EILSEQ; + + ogf = hci_opcode_ogf(opcode); + ocf = hci_opcode_ocf(opcode); + + if (!hdev->hci_drv) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) { + if (opcode < hdev->hci_drv->common_handler_count) + handler = &hdev->hci_drv->common_handlers[opcode]; + } else { + if (ocf < hdev->hci_drv->specific_handler_count) + handler = &hdev->hci_drv->specific_handlers[ocf]; + } + + if (!handler || !handler->func) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (len != handler->data_len) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_INVALID_PARAMETERS); + + return handler->func(hdev, skb->data, len); +} +EXPORT_SYMBOL(hci_drv_process_cmd); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 022b86797acd..428ee5c7de7e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -234,7 +234,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -391,6 +392,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); break; + case HCI_DRV_PKT: + if (bt_cb(skb)->incoming) + opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT); + else + opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT); + break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; @@ -1860,7 +1867,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) { err = -EINVAL; goto drop; } -- cgit v1.2.3 From dd0ccf858057b793beb3779be7576d92c93cf828 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 27 Apr 2025 14:27:25 +0300 Subject: Bluetooth: add support for SIOCETHTOOL ETHTOOL_GET_TS_INFO Bluetooth needs some way for user to get supported so_timestamping flags for the different socket types. Use SIOCETHTOOL API for this purpose. As hci_dev is not associated with struct net_device, the existing implementation can't be reused, so we add a small one here. Add support (only) for ETHTOOL_GET_TS_INFO command. The API differs slightly from netdev in that the result depends also on socket type. Signed-off-by: Pauli Virtanen Acked-by: Willem de Bruijn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++ net/bluetooth/af_bluetooth.c | 87 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/hci_conn.c | 33 +++++++++++++++ 3 files changed, 124 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index bbefde319f95..114299bd8b98 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,7 @@ #include #include #include +#include #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 @@ -448,6 +449,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb); +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b4d0a8bd361..6ad2f72f53f4 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -34,6 +34,9 @@ #include #include +#include +#include + #include "leds.h" #include "selftest.h" @@ -563,6 +566,86 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, } EXPORT_SYMBOL(bt_sock_poll); +static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index, + void __user *useraddr) +{ + struct ethtool_ts_info info; + struct kernel_ethtool_ts_info ts_info = {}; + int ret; + + ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info); + if (ret == -ENODEV) + return ret; + else if (ret < 0) + return -EIO; + + memset(&info, 0, sizeof(info)); + + info.cmd = ETHTOOL_GET_TS_INFO; + info.so_timestamping = ts_info.so_timestamping; + info.phc_index = ts_info.phc_index; + info.tx_types = ts_info.tx_types; + info.rx_filters = ts_info.rx_filters; + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int bt_ethtool(struct sock *sk, const struct ifreq *ifr, + void __user *useraddr) +{ + unsigned int index; + u32 ethcmd; + int n; + + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 || + n != strlen(ifr->ifr_name)) + return -ENODEV; + + switch (ethcmd) { + case ETHTOOL_GET_TS_INFO: + return bt_ethtool_get_ts_info(sk, index, useraddr); + } + + return -EOPNOTSUPP; +} + +static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) +{ + struct sock *sk = sock->sk; + struct ifreq ifr = {}; + void __user *data; + char *colon; + int ret = -ENOIOCTLCMD; + + if (get_user_ifreq(&ifr, &data, arg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + switch (cmd) { + case SIOCETHTOOL: + ret = bt_ethtool(sk, &ifr, data); + break; + } + + if (colon) + *colon = ':'; + + if (put_user_ifreq(&ifr, arg)) + return -EFAULT; + + return ret; +} + int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -595,6 +678,10 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = put_user(amount, (int __user *)arg); break; + case SIOCETHTOOL: + err = bt_dev_ioctl(sock, cmd, (void __user *)arg); + break; + default: err = -ENOIOCTLCMD; break; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c0207812f432..59a56425521f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -3049,3 +3049,36 @@ u8 *hci_conn_key_enc_size(struct hci_conn *conn) return NULL; } + +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *info) +{ + struct hci_dev *hdev; + + hdev = hci_dev_get(index); + if (!hdev) + return -ENODEV; + + info->so_timestamping = + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info->phc_index = -1; + info->tx_types = BIT(HWTSTAMP_TX_OFF); + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); + + switch (sk_proto) { + case BTPROTO_ISO: + case BTPROTO_L2CAP: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + case BTPROTO_SCO: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + } + + hci_dev_put(hdev); + return 0; +} -- cgit v1.2.3 From 23205562ffc8de20f57afdd984858cab29e77968 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 3 May 2025 17:08:21 +0300 Subject: Bluetooth: separate CIS_LINK and BIS_LINK link types Use separate link type id for unicast and broadcast ISO connections. These connection types are handled with separate HCI commands, socket API is different, and hci_conn has union fields that are different in the two cases, so they shall not be mixed up. Currently in most places it is attempted to distinguish ucast by bacmp(&c->dst, BDADDR_ANY) but it is wrong as dst is set for bcast sink hci_conn in iso_conn_ready(). Additionally checking sync_handle might be OK, but depends on details of bcast conn configuration flow. To avoid complicating it, use separate link types. Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 3 ++- include/net/bluetooth/hci_core.h | 48 ++++++++++++++++++---------------------- net/bluetooth/hci_conn.c | 44 +++++++++++++++++++++--------------- net/bluetooth/hci_core.c | 21 +++++++++++------- net/bluetooth/hci_event.c | 24 +++++++++++--------- net/bluetooth/hci_sync.c | 16 +++++++++----- net/bluetooth/iso.c | 4 ++-- net/bluetooth/mgmt.c | 3 ++- 8 files changed, 89 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 2502febf4da7..82cbd54443ac 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -558,7 +558,8 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 #define INVALID_LINK 0xff /* LMP features */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 57f6175fd1cd..2b261e74e2c4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -999,7 +999,8 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num++; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: h->iso_num++; break; } @@ -1025,7 +1026,8 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num--; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: h->iso_num--; break; } @@ -1042,7 +1044,8 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case SCO_LINK: case ESCO_LINK: return h->sco_num; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: return h->iso_num; default: return 0; @@ -1103,7 +1106,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) continue; if (c->iso_qos.bcast.bis == bis) { @@ -1125,7 +1128,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) @@ -1151,8 +1154,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK || - !test_bit(HCI_CONN_PER_ADV, &c->flags)) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && @@ -1241,7 +1244,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; /* Match CIG ID if set */ @@ -1273,7 +1276,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1296,17 +1299,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) - continue; - - /* An ISO_LINK hcon with BDADDR_ANY as destination - * address is a Broadcast connection. A Broadcast - * slave connection is associated with a PA train, - * so the sync_handle can be used to differentiate - * from unicast. - */ - if (bacmp(&c->dst, BDADDR_ANY) && - c->sync_handle == HCI_SYNC_HANDLE_INVALID) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big) { @@ -1330,7 +1323,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { @@ -1353,8 +1346,8 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || - c->state != state) + if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) || + c->state != state) continue; if (handle == c->iso_qos.bcast.big) { @@ -1377,8 +1370,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || - !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) continue; if (c->iso_qos.bcast.big == big) { @@ -1400,7 +1393,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != BIS_LINK) continue; /* Ignore the listen hcon, we are looking @@ -2015,7 +2008,8 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 59a56425521f..99efeed6a766 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -785,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->sync_handle = conn->sync_handle; if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) @@ -795,7 +795,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c } if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_BIG_SYNC, d); if (!d->count) @@ -885,9 +885,11 @@ static void cis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a CIS and remove CIG if there are * no other connections using it. */ - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_BOUND, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, + &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, + &d); if (d.count) return; @@ -910,7 +912,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t if (!hdev->acl_mtu) return ERR_PTR(-ECONNREFUSED); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if (hdev->iso_mtu) /* Dedicated ISO Buffer exists */ break; @@ -974,7 +977,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); @@ -1071,7 +1075,8 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) if (HCI_CONN_HANDLE_UNSET(conn->handle)) hci_conn_failed(conn, reason); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if ((conn->state != BT_CONNECTED && !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) @@ -1146,7 +1151,8 @@ void hci_conn_del(struct hci_conn *conn) hdev->acl_cnt += conn->sent; } else { /* Unacked ISO frames */ - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK || + conn->type == BIS_LINK) { if (hdev->iso_pkts) hdev->iso_cnt += conn->sent; else if (hdev->le_pkts) @@ -1532,7 +1538,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; @@ -1740,7 +1746,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) data.count = 0; /* Create a BIS for each bound connection */ - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_list, BIS_LINK, BT_BOUND, &data); cp.handle = qos->bcast.big; @@ -1829,12 +1835,12 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) for (data.cig = 0x00; data.cig < 0xf0; data.cig++) { data.count = 0; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, &data); if (data.count) continue; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, &data); if (!data.count) break; @@ -1884,7 +1890,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { - cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + cis = hci_conn_add_unset(hdev, CIS_LINK, dst, + HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; cis->cleanup = cis_cleanup; @@ -1976,7 +1983,7 @@ bool hci_iso_setup_path(struct hci_conn *conn) int hci_conn_check_create_cis(struct hci_conn *conn) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) + if (conn->type != CIS_LINK) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || @@ -2072,7 +2079,7 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; @@ -2221,7 +2228,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ @@ -2953,7 +2960,8 @@ void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) * TODO: SCO support without flowctl (needs to be done in drivers) */ switch (conn->type) { - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: case ACL_LINK: case LE_LINK: break; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index a58fd3bf8851..3b49828160b7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2898,12 +2898,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ - if (hci_conn_num(hdev, ISO_LINK)) { + if (hci_conn_num(hdev, CIS_LINK) || + hci_conn_num(hdev, BIS_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); - if (type == ISO_LINK) + if (type == CIS_LINK || type == BIS_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; @@ -3356,7 +3357,8 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; @@ -3370,7 +3372,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, - int *quote) + __u8 type2, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; @@ -3382,7 +3384,8 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != type || skb_queue_empty(&c->data_q)) + if ((c->type != type && c->type != type2) || + skb_queue_empty(&c->data_q)) continue; if (c->state != BT_CONNECTED && c->state != BT_CONFIG) @@ -3590,7 +3593,7 @@ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) else cnt = &hdev->sco_cnt; - while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { + while (*cnt && (conn = hci_low_sent(hdev, type, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); @@ -3718,12 +3721,14 @@ static void hci_sched_iso(struct hci_dev *hdev) BT_DBG("%s", hdev->name); - if (!hci_conn_num(hdev, ISO_LINK)) + if (!hci_conn_num(hdev, CIS_LINK) && + !hci_conn_num(hdev, BIS_LINK)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; - while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, "e))) { + while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK, + "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4183560582a3..66052d6aaa1d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3804,7 +3804,7 @@ static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status) lockdep_assert_held(&hdev->lock); list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) || + if (conn->type != CIS_LINK || conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig) continue; @@ -4467,7 +4467,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if (hdev->iso_pkts) { hdev->iso_cnt += count; if (hdev->iso_cnt > hdev->iso_pkts) @@ -6413,7 +6414,8 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, conn->sync_handle = le16_to_cpu(ev->handle); conn->sid = HCI_SID_INVALID; - mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, BIS_LINK, + &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); goto unlock; @@ -6423,7 +6425,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6454,7 +6456,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; @@ -6738,7 +6740,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } - if (conn->type != ISO_LINK) { + if (conn->type != CIS_LINK) { bt_dev_err(hdev, "Invalid connection link type handle 0x%4.4x", handle); @@ -6856,7 +6858,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, if (!acl) goto unlock; - mask = hci_proto_connect_ind(hdev, &acl->dst, ISO_LINK, &flags); + mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6864,8 +6866,8 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, cis = hci_conn_hash_lookup_handle(hdev, cis_handle); if (!cis) { - cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE, - cis_handle); + cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, + HCI_ROLE_SLAVE, cis_handle); if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6980,7 +6982,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "ignore too large handle %u", handle); continue; } - bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, + bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) continue; @@ -7036,7 +7038,7 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index d00ff18f3be0..62d1ff951ebe 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2860,7 +2860,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (sent) { struct hci_conn *conn; - conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, + conn = hci_conn_hash_lookup_ba(hdev, BIS_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; @@ -5477,7 +5477,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * @@ -5490,9 +5490,10 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ - if (bacmp(&conn->dst, BDADDR_ANY)) - return HCI_ERROR_LOCAL_HOST_TERM; + return HCI_ERROR_LOCAL_HOST_TERM; + } + if (conn->type == BIS_LINK) { /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ @@ -5554,9 +5555,12 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, { struct hci_cp_reject_conn_req cp; - if (conn->type == ISO_LINK) + if (conn->type == CIS_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); + if (conn->type == BIS_LINK) + return -EINVAL; + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); @@ -6917,7 +6921,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) goto unlock; /* Add connection to indicate PA sync error */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 2f348f48e99d..6e2c752aaa8f 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2219,7 +2219,7 @@ done: static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { - if (hcon->type != ISO_LINK) { + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) { if (hcon->type != LE_LINK) return; @@ -2260,7 +2260,7 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { - if (hcon->type != ISO_LINK) + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 46b22708dfbd..261926dccc7e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3221,7 +3221,8 @@ failed: static u8 link_to_bdaddr(u8 link_type, u8 addr_type) { switch (link_type) { - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: case LE_LINK: switch (addr_type) { case ADDR_LE_DEV_PUBLIC: -- cgit v1.2.3 From 8b8762eeec59b959fbca60afffe21265bce67168 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 May 2025 09:19:05 -0700 Subject: tools: ynl-gen: add makefile deps for neigh Kory is reporting build issues after recent additions to YNL if the system headers are old. Link: https://lore.kernel.org/20250519164949.597d6e92@kmaincent-XPS-13-7390 Reported-by: Kory Maincent Fixes: 0939a418b3b0 ("tools: ynl: submsg: reverse parse / error reporting") Tested-by: Kory Maincent Link: https://patch.msgid.link/20250520161916.413298-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/neighbour.h | 4 ++-- tools/net/ynl/Makefile.deps | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 5e67a7eaf4a7..b851c36ad25d 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_NEIGHBOUR_H -#define __LINUX_NEIGHBOUR_H +#ifndef _UAPI__LINUX_NEIGHBOUR_H +#define _UAPI__LINUX_NEIGHBOUR_H #include #include diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index e5a5cb1b2cff..8c378356fc87 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -35,7 +35,8 @@ CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,_LINUX_IF_LINK_H,if_link.h) -CFLAGS_rt-neigh:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) +CFLAGS_rt-neigh:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ + $(call get_hdr_inc,__LINUX_NEIGHBOUR_H,neighbour.h) CFLAGS_rt-route:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) CFLAGS_rt-rule:=$(call get_hdr_inc,__LINUX_FIB_RULES_H,fib_rules.h) CFLAGS_tcp_metrics:=$(call get_hdr_inc,_LINUX_TCP_METRICS_H,tcp_metrics.h) -- cgit v1.2.3 From a5bd029c733b8ae790d5873e2afeb88b58e3a151 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:04 -0700 Subject: net: add skb_crc32c() Add skb_crc32c(), which calculates the CRC32C of a sk_buff. It will replace __skb_checksum(), which unnecessarily supports arbitrary checksums. Compared to __skb_checksum(), skb_crc32c(): - Uses the correct type for CRC32C values (u32, not __wsum). - Does not require the caller to provide a skb_checksum_ops struct. - Is faster because it does not use indirect calls and does not use the very slow crc32c_combine(). According to commit 2817a336d4d5 ("net: skb_checksum: allow custom update/combine for walking skb") which added __skb_checksum(), the original motivation for the abstraction layer was to avoid code duplication for CRC32C and other checksums in the future. However: - No additional checksums showed up after CRC32C. __skb_checksum() is only used with the "regular" net checksum and CRC32C. - Indirect calls are expensive. Commit 2544af0344ba ("net: avoid indirect calls in L4 checksum calculation") worked around this using the INDIRECT_CALL_1 macro. But that only avoided the indirect call for the net checksum, and at the cost of an extra branch. - The checksums use different types (__wsum and u32), causing casts to be needed. - It made the checksums of fragments be combined (rather than chained) for both checksums, despite this being highly counterproductive for CRC32C due to how slow crc32c_combine() is. This can clearly be seen in commit 4c2f24549644 ("sctp: linearize early if it's not GSO") which tried to work around this performance bug. With a dedicated function for each checksum, we can instead just use the proper strategy for each checksum. As shown by the following tables, the new function skb_crc32c() is faster than __skb_checksum(), with the improvement varying greatly from 5% to 2500% depending on the case. The largest improvements come from fragmented packets, mainly due to eliminating the inefficient crc32c_combine(). But linear packets are improved too, especially shorter ones, mainly due to eliminating indirect calls. These benchmarks were done on AMD Zen 5. On that CPU, Linux uses IBRS instead of retpoline; an even greater improvement might be seen with retpoline: Linear sk_buffs Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 43 18 256 94 77 1420 204 161 16384 1735 1642 Nonlinear sk_buffs (even split between head and one fragment) Length in bytes __skb_checksum cycles skb_crc32c cycles =============== ===================== ================= 64 579 22 256 829 77 1420 1506 194 16384 4365 1682 Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c7397b17bb08..7ccc6356acac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4203,6 +4203,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4159107f1666..94b977db47f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -3633,6 +3634,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; -- cgit v1.2.3 From 99de9d4022e5004f95f425f798f0aa01e87949ff Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:07 -0700 Subject: sctp: use skb_crc32c() instead of __skb_checksum() Make sctp_compute_cksum() just use the new function skb_crc32c(), instead of calling __skb_checksum() with a skb_checksum_ops struct that does CRC32C. This is faster and simpler. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-6-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/checksum.h | 29 +++-------------------------- net/netfilter/Kconfig | 4 ++-- net/netfilter/ipvs/Kconfig | 2 +- net/openvswitch/Kconfig | 2 +- net/sched/Kconfig | 2 +- net/sctp/Kconfig | 1 - net/sctp/offload.c | 1 - 7 files changed, 8 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 291465c25810..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm * Sridhar Samudrala - * - * Rewritten to use libcrc32c by: * Vlad Yasevich */ @@ -25,39 +23,18 @@ #include #include -#include -#include - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)crc32c_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 3b2183fc7e56..2560416218d0 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -212,7 +212,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select CRC32 + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -475,7 +475,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK - select CRC32 + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 8c5b1fe12d07..c203252e856d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -105,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select CRC32 + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 5481bd561eb4..e6aaee92dba4 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -11,8 +11,8 @@ config OPENVSWITCH (!NF_NAT || NF_NAT) && \ (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT))) depends on PSAMPLE || !PSAMPLE - select CRC32 select MPLS + select NET_CRC32C select NET_MPLS_GSO select DST_CACHE select NET_NSH diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 9f0b3f943fca..ad914d2b2e22 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -796,7 +796,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET - select CRC32 + select NET_CRC32C help Say Y here to update some common checksum after some direct packet alterations. diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 3669ba351856..24d5a35ce894 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,7 +7,6 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRC32 select CRYPTO select CRYPTO_HMAC select CRYPTO_SHA1 diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 502095173d88..e6f863c031b4 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -111,7 +111,6 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: -- cgit v1.2.3 From 70c96c7cb9f035d5b960021f2450afa6240e66b4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:08 -0700 Subject: net: fold __skb_checksum() into skb_checksum() Now that the only remaining caller of __skb_checksum() is skb_checksum(), fold __skb_checksum() into skb_checksum(). This makes struct skb_checksum_ops unnecessary, so remove that too and simply do the "regular" net checksum. It also makes the wrapper functions csum_partial_ext() and csum_block_add_ext() unnecessary, so remove those too and just use the underlying functions. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-7-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 -------- include/net/checksum.h | 12 ---------- net/core/skbuff.c | 59 ++++++-------------------------------------------- 3 files changed, 7 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7ccc6356acac..018c07230513 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4192,15 +4192,6 @@ static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) return copy_to_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } -struct skb_checksum_ops { - __wsum (*update)(const void *mem, int len, __wsum wsum); - __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); -}; - -extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly; - -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops); __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc); diff --git a/include/net/checksum.h b/include/net/checksum.h index 243f972267b8..e57986b173f8 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -98,12 +98,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) return csum_add(csum, csum_shift(csum2, offset)); } -static __always_inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) -{ - return csum_block_add(csum, csum2, offset); -} - static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { @@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n) return (__force __wsum)n; } -static __always_inline -__wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 94b977db47f9..85fc82f72d26 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3445,8 +3445,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -3457,8 +3456,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, - skb->data + offset, copy, csum); + csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -3488,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = INDIRECT_CALL_1(ops->update, - csum_partial_ext, - vaddr + p_off, p_len, 0); + csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = INDIRECT_CALL_1(ops->combine, - csum_block_add_ext, csum, - csum2, pos, p_len); + csum = csum_block_add(csum, csum2, pos); pos += p_len; } @@ -3515,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum2; if (copy > len) copy = len; - csum2 = __skb_checksum(frag_iter, offset - start, - copy, 0, ops); - csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, - csum, csum2, pos, copy); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); + csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; @@ -3530,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, return csum; } -EXPORT_SYMBOL(__skb_checksum); - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - const struct skb_checksum_ops ops = { - .update = csum_partial_ext, - .combine = csum_block_add_ext, - }; - - return __skb_checksum(skb, offset, len, csum, &ops); -} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. */ @@ -3765,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_checksum_complete); -static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static const struct skb_checksum_ops default_crc32c_ops = { - .update = warn_crc32c_csum_update, - .combine = warn_crc32c_csum_combine, -}; - -const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = - &default_crc32c_ops; -EXPORT_SYMBOL(crc32c_csum_stub); - /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer -- cgit v1.2.3 From b82f72292ab4c65250bd734281464a6ab1ff4133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:09 -0700 Subject: lib/crc32: remove unused support for CRC32C combination crc32c_combine() and crc32c_shift() are no longer used (except by the KUnit test that tests them), and their current implementation is very slow. Remove them. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-8-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/crc32.h | 23 ----------------------- lib/crc32.c | 6 ------ lib/tests/crc_kunit.c | 6 ------ 3 files changed, 35 deletions(-) (limited to 'include') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 69c2e8bb3782..7f7d0be8a0ac 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -76,29 +76,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } -u32 crc32c_shift(u32 crc, size_t len); - -/** - * crc32c_combine - Combine two crc32c check values into one. For two sequences - * of bytes, seq1 and seq2 with lengths len1 and len2, crc32c() - * check values were calculated for each, crc1 and crc2. - * - * @crc1: crc32c of the first block - * @crc2: crc32c of the second block - * @len2: length of the second block - * - * Return: The crc32c() check value of seq1 and seq2 concatenated, requiring - * only crc1, crc2, and len2. Note: If seq_full denotes the concatenated - * memory area of seq1 with seq2, and crc_full the crc32c() value of - * seq_full, then crc_full == crc32c_combine(crc1, crc2, len2) when - * crc_full was seeded with the same initializer as crc1, and crc2 seed - * was 0. See also crc_combine_test(). - */ -static inline u32 crc32c_combine(u32 crc1, u32 crc2, size_t len2) -{ - return crc32c_shift(crc1, len2) ^ crc2; -} - #define crc32(seed, data, length) crc32_le(seed, (unsigned char const *)(data), length) /* diff --git a/lib/crc32.c b/lib/crc32.c index fddd424ff224..ade48bbb0083 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -119,12 +119,6 @@ u32 crc32_le_shift(u32 crc, size_t len) } EXPORT_SYMBOL(crc32_le_shift); -u32 crc32c_shift(u32 crc, size_t len) -{ - return crc32_generic_shift(crc, len, CRC32C_POLY_LE); -} -EXPORT_SYMBOL(crc32c_shift); - u32 crc32_be_base(u32 crc, const u8 *p, size_t len) { while (len--) diff --git a/lib/tests/crc_kunit.c b/lib/tests/crc_kunit.c index 585c48b65cef..064c2d581557 100644 --- a/lib/tests/crc_kunit.c +++ b/lib/tests/crc_kunit.c @@ -391,17 +391,11 @@ static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len) return crc32c(crc, p, len); } -static u64 crc32c_combine_wrapper(u64 crc1, u64 crc2, size_t len2) -{ - return crc32c_combine(crc1, crc2, len2); -} - static const struct crc_variant crc_variant_crc32c = { .bits = 32, .le = true, .poly = 0x82f63b78, .func = crc32c_wrapper, - .combine_func = crc32c_combine_wrapper, }; static void crc32c_test(struct kunit *test) -- cgit v1.2.3 From ea6342d98928e243f2024fb97a9b4d42ee55dfba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:10 -0700 Subject: net: add skb_copy_and_crc32c_datagram_iter() Since skb_copy_and_hash_datagram_iter() is used only with CRC32C, the crypto_ahash abstraction provides no value. Add skb_copy_and_crc32c_datagram_iter() which just calls crc32c() directly. This is faster and simpler. It also doesn't have the weird dependency issue where skb_copy_and_hash_datagram_iter() depends on CONFIG_CRYPTO_HASH=y without that being expressed explicitly in the kconfig (presumably because it was too heavyweight for NET to select). The new function is conditional on the hidden boolean symbol NET_CRC32C, which selects CRC32. So it gets compiled only when something that actually needs CRC32C packet checksums is enabled, it has no implicit dependency, and it doesn't depend on the heavyweight crypto layer. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-9-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ net/core/datagram.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 018c07230513..510adf63c211 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4137,6 +4137,8 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, struct ahash_request *hash); +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); diff --git a/net/core/datagram.c b/net/core/datagram.c index 9ef5442536f5..fa87abb66632 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -519,6 +520,38 @@ int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) +{ + u32 *crcp = _crcp; + size_t copied; + + copied = copy_to_iter(addr, bytes, i); + *crcp = crc32c(*crcp, addr, copied); + return copied; +} + +/** + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @crcp: pointer to CRC32C value to update + * + * Return: 0 on success, -EFAULT if there was a fault during copy. + */ +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + crc32c_and_copy_to_iter, crcp); +} +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ + static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) { -- cgit v1.2.3 From c93f75b2d755c35b596084ddd3feb3528284a53f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 19 May 2025 10:50:12 -0700 Subject: net: remove skb_copy_and_hash_datagram_iter() Now that skb_copy_and_hash_datagram_iter() is no longer used, remove it. Signed-off-by: Eric Biggers Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20250519175012.36581-11-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ---- net/core/datagram.c | 37 ------------------------------------- 2 files changed, 41 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 510adf63c211..5520524c93bf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -274,7 +274,6 @@ SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) -struct ahash_request; struct net_device; struct scatterlist; struct pipe_inode_info; @@ -4134,9 +4133,6 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, } int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, struct msghdr *msg); -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash); int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, diff --git a/net/core/datagram.c b/net/core/datagram.c index fa87abb66632..b352a1009304 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -62,7 +62,6 @@ #include #include #include -#include #include "devmem.h" @@ -484,42 +483,6 @@ short_copy: return 0; } -static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, - struct iov_iter *i) -{ -#ifdef CONFIG_CRYPTO_HASH - struct ahash_request *hash = hashp; - struct scatterlist sg; - size_t copied; - - copied = copy_to_iter(addr, bytes, i); - sg_init_one(&sg, addr, copied); - ahash_request_set_crypt(hash, &sg, NULL, copied); - crypto_ahash_update(hash); - return copied; -#else - return 0; -#endif -} - -/** - * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator - * and update a hash. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: iovec iterator to copy to - * @len: amount of data to copy from buffer to iovec - * @hash: hash request to update - */ -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash) -{ - return __skb_datagram_iter(skb, offset, to, len, true, - hash_and_copy_to_iter, hash); -} -EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); - #ifdef CONFIG_NET_CRC32C static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, void *_crcp, struct iov_iter *i) -- cgit v1.2.3 From 31afd6bc55cc0093c3e5b0a368319e423d4de8ea Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:45 +0200 Subject: net: phy: pass PHY driver to .match_phy_device OP Pass PHY driver pointer to .match_phy_device OP in addition to phydev. Having access to the PHY driver struct might be useful to check the PHY ID of the driver is being matched for in case the PHY ID scanned in the phydev is not consistent. A scenario for this is a PHY that change PHY ID after a firmware is loaded, in such case, the PHY ID stored in PHY device struct is not valid anymore and PHY will manually scan the ID in the match_phy_device function. Having the PHY driver info is also useful for those PHY driver that implement multiple simple .match_phy_device OP to match specific MMD PHY ID. With this extra info if the parsing logic is the same, the matching function can be generalized by using the phy_id in the PHY driver instead of hardcoding. Rust wrapper callback is updated to align to the new match_phy_device arguments. Suggested-by: Russell King (Oracle) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Reviewed-by: Benno Lossin # for Rust Reviewed-by: FUJITA Tomonori Link: https://patch.msgid.link/20250517201353.5137-2-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm87xx.c | 6 ++++-- drivers/net/phy/icplus.c | 6 ++++-- drivers/net/phy/marvell10g.c | 12 ++++++++---- drivers/net/phy/micrel.c | 6 ++++-- drivers/net/phy/nxp-c45-tja11xx.c | 12 ++++++++---- drivers/net/phy/nxp-tja11xx.c | 6 ++++-- drivers/net/phy/phy_device.c | 2 +- drivers/net/phy/realtek/realtek_main.c | 27 ++++++++++++++++++--------- drivers/net/phy/teranetics.c | 3 ++- include/linux/phy.h | 3 ++- rust/kernel/net/phy.rs | 1 + 11 files changed, 56 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/bcm87xx.c b/drivers/net/phy/bcm87xx.c index e81404bf8994..1e1e2259fc2b 100644 --- a/drivers/net/phy/bcm87xx.c +++ b/drivers/net/phy/bcm87xx.c @@ -185,12 +185,14 @@ static irqreturn_t bcm87xx_handle_interrupt(struct phy_device *phydev) return IRQ_HANDLED; } -static int bcm8706_match_phy_device(struct phy_device *phydev) +static int bcm8706_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->c45_ids.device_ids[4] == PHY_ID_BCM8706; } -static int bcm8727_match_phy_device(struct phy_device *phydev) +static int bcm8727_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->c45_ids.device_ids[4] == PHY_ID_BCM8727; } diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index bbcc7d2b54cd..c0c4f19cfb6a 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -520,12 +520,14 @@ static int ip101a_g_match_phy_device(struct phy_device *phydev, bool ip101a) return ip101a == !ret; } -static int ip101a_match_phy_device(struct phy_device *phydev) +static int ip101a_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return ip101a_g_match_phy_device(phydev, true); } -static int ip101g_match_phy_device(struct phy_device *phydev) +static int ip101g_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return ip101a_g_match_phy_device(phydev, false); } diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 5354c8895163..13e81dff42c1 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -1264,7 +1264,8 @@ static int mv3310_get_number_of_ports(struct phy_device *phydev) return ret + 1; } -static int mv3310_match_phy_device(struct phy_device *phydev) +static int mv3310_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { if ((phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] & MARVELL_PHY_ID_MASK) != MARVELL_PHY_ID_88X3310) @@ -1273,7 +1274,8 @@ static int mv3310_match_phy_device(struct phy_device *phydev) return mv3310_get_number_of_ports(phydev) == 1; } -static int mv3340_match_phy_device(struct phy_device *phydev) +static int mv3340_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { if ((phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] & MARVELL_PHY_ID_MASK) != MARVELL_PHY_ID_88X3310) @@ -1297,12 +1299,14 @@ static int mv211x_match_phy_device(struct phy_device *phydev, bool has_5g) return !!(val & MDIO_PCS_SPEED_5G) == has_5g; } -static int mv2110_match_phy_device(struct phy_device *phydev) +static int mv2110_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return mv211x_match_phy_device(phydev, true); } -static int mv2111_match_phy_device(struct phy_device *phydev) +static int mv2111_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return mv211x_match_phy_device(phydev, false); } diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index c2e5be404f07..64aa03aed770 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -768,7 +768,8 @@ static int ksz8051_ksz8795_match_phy_device(struct phy_device *phydev, return !ret; } -static int ksz8051_match_phy_device(struct phy_device *phydev) +static int ksz8051_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return ksz8051_ksz8795_match_phy_device(phydev, true); } @@ -888,7 +889,8 @@ static int ksz8061_config_init(struct phy_device *phydev) return kszphy_config_init(phydev); } -static int ksz8795_match_phy_device(struct phy_device *phydev) +static int ksz8795_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return ksz8051_ksz8795_match_phy_device(phydev, false); } diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index f11dd32494c3..22921b192a8b 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -1966,25 +1966,29 @@ static int nxp_c45_macsec_ability(struct phy_device *phydev) return macsec_ability; } -static int tja1103_match_phy_device(struct phy_device *phydev) +static int tja1103_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phy_id_compare(phydev->phy_id, PHY_ID_TJA_1103, PHY_ID_MASK) && !nxp_c45_macsec_ability(phydev); } -static int tja1104_match_phy_device(struct phy_device *phydev) +static int tja1104_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phy_id_compare(phydev->phy_id, PHY_ID_TJA_1103, PHY_ID_MASK) && nxp_c45_macsec_ability(phydev); } -static int tja1120_match_phy_device(struct phy_device *phydev) +static int tja1120_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phy_id_compare(phydev->phy_id, PHY_ID_TJA_1120, PHY_ID_MASK) && !nxp_c45_macsec_ability(phydev); } -static int tja1121_match_phy_device(struct phy_device *phydev) +static int tja1121_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phy_id_compare(phydev->phy_id, PHY_ID_TJA_1120, PHY_ID_MASK) && nxp_c45_macsec_ability(phydev); diff --git a/drivers/net/phy/nxp-tja11xx.c b/drivers/net/phy/nxp-tja11xx.c index 07e94a2478ac..3c38a8ddae2f 100644 --- a/drivers/net/phy/nxp-tja11xx.c +++ b/drivers/net/phy/nxp-tja11xx.c @@ -651,12 +651,14 @@ static int tja1102_match_phy_device(struct phy_device *phydev, bool port0) return !ret; } -static int tja1102_p0_match_phy_device(struct phy_device *phydev) +static int tja1102_p0_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return tja1102_match_phy_device(phydev, true); } -static int tja1102_p1_match_phy_device(struct phy_device *phydev) +static int tja1102_p1_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return tja1102_match_phy_device(phydev, false); } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 781dfa6680eb..6006cff1a73e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -554,7 +554,7 @@ static int phy_bus_match(struct device *dev, const struct device_driver *drv) return 0; if (phydrv->match_phy_device) - return phydrv->match_phy_device(phydev); + return phydrv->match_phy_device(phydev, phydrv); if (phydev->is_c45) { for (i = 1; i < num_ids; i++) { diff --git a/drivers/net/phy/realtek/realtek_main.c b/drivers/net/phy/realtek/realtek_main.c index b5e00cdf0123..c3dcb6257430 100644 --- a/drivers/net/phy/realtek/realtek_main.c +++ b/drivers/net/phy/realtek/realtek_main.c @@ -1315,13 +1315,15 @@ static bool rtlgen_supports_mmd(struct phy_device *phydev) return val > 0; } -static int rtlgen_match_phy_device(struct phy_device *phydev) +static int rtlgen_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->phy_id == RTL_GENERIC_PHYID && !rtlgen_supports_2_5gbps(phydev); } -static int rtl8226_match_phy_device(struct phy_device *phydev) +static int rtl8226_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->phy_id == RTL_GENERIC_PHYID && rtlgen_supports_2_5gbps(phydev) && @@ -1337,32 +1339,38 @@ static int rtlgen_is_c45_match(struct phy_device *phydev, unsigned int id, return !is_c45 && (id == phydev->phy_id); } -static int rtl8221b_match_phy_device(struct phy_device *phydev) +static int rtl8221b_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->phy_id == RTL_8221B && rtlgen_supports_mmd(phydev); } -static int rtl8221b_vb_cg_c22_match_phy_device(struct phy_device *phydev) +static int rtl8221b_vb_cg_c22_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return rtlgen_is_c45_match(phydev, RTL_8221B_VB_CG, false); } -static int rtl8221b_vb_cg_c45_match_phy_device(struct phy_device *phydev) +static int rtl8221b_vb_cg_c45_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return rtlgen_is_c45_match(phydev, RTL_8221B_VB_CG, true); } -static int rtl8221b_vn_cg_c22_match_phy_device(struct phy_device *phydev) +static int rtl8221b_vn_cg_c22_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return rtlgen_is_c45_match(phydev, RTL_8221B_VN_CG, false); } -static int rtl8221b_vn_cg_c45_match_phy_device(struct phy_device *phydev) +static int rtl8221b_vn_cg_c45_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return rtlgen_is_c45_match(phydev, RTL_8221B_VN_CG, true); } -static int rtl_internal_nbaset_match_phy_device(struct phy_device *phydev) +static int rtl_internal_nbaset_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { if (phydev->is_c45) return false; @@ -1381,7 +1389,8 @@ static int rtl_internal_nbaset_match_phy_device(struct phy_device *phydev) return rtlgen_supports_2_5gbps(phydev) && !rtlgen_supports_mmd(phydev); } -static int rtl8251b_c45_match_phy_device(struct phy_device *phydev) +static int rtl8251b_c45_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return rtlgen_is_c45_match(phydev, RTL_8251B, true); } diff --git a/drivers/net/phy/teranetics.c b/drivers/net/phy/teranetics.c index 752d4bf7bb99..46c5ff7d7b56 100644 --- a/drivers/net/phy/teranetics.c +++ b/drivers/net/phy/teranetics.c @@ -67,7 +67,8 @@ static int teranetics_read_status(struct phy_device *phydev) return 0; } -static int teranetics_match_phy_device(struct phy_device *phydev) +static int teranetics_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { return phydev->c45_ids.device_ids[3] == PHY_ID_TN2020; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 92a88b5ce356..10e66d45a8e8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -990,7 +990,8 @@ struct phy_driver { * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ - int (*match_phy_device)(struct phy_device *phydev); + int (*match_phy_device)(struct phy_device *phydev, + const struct phy_driver *phydrv); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY diff --git a/rust/kernel/net/phy.rs b/rust/kernel/net/phy.rs index a59469c785e3..32ea43ece646 100644 --- a/rust/kernel/net/phy.rs +++ b/rust/kernel/net/phy.rs @@ -421,6 +421,7 @@ impl Adapter { /// `phydev` must be passed by the corresponding callback in `phy_driver`. unsafe extern "C" fn match_phy_device_callback( phydev: *mut bindings::phy_device, + _phydrv: *const bindings::phy_driver, ) -> crate::ffi::c_int { // SAFETY: This callback is called only in contexts // where we hold `phy_device->lock`, so the accessors on -- cgit v1.2.3 From d6c45707ac84c2d9f274ece1cea4dddb97996bde Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 17 May 2025 22:13:48 +0200 Subject: net: phy: introduce genphy_match_phy_device() Introduce new API, genphy_match_phy_device(), to provide a way to check to match a PHY driver for a PHY device based on the info stored in the PHY device struct. The function generalize the logic used in phy_bus_match() to check the PHY ID whether if C45 or C22 ID should be used for matching. This is useful for custom .match_phy_device function that wants to use the generic logic under some condition. (example a PHY is already setup and provide the correct PHY ID) Reviewed-by: Russell King (Oracle) Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250517201353.5137-5-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 52 +++++++++++++++++++++++++++++++------------- include/linux/phy.h | 3 +++ 2 files changed, 40 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 6006cff1a73e..0f6f86252622 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -543,20 +543,26 @@ static int phy_scan_fixups(struct phy_device *phydev) return 0; } -static int phy_bus_match(struct device *dev, const struct device_driver *drv) +/** + * genphy_match_phy_device - match a PHY device with a PHY driver + * @phydev: target phy_device struct + * @phydrv: target phy_driver struct + * + * Description: Checks whether the given PHY device matches the specified + * PHY driver. For Clause 45 PHYs, iterates over the available device + * identifiers and compares them against the driver's expected PHY ID, + * applying the provided mask. For Clause 22 PHYs, a direct ID comparison + * is performed. + * + * Return: 1 if the PHY device matches the driver, 0 otherwise. + */ +int genphy_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv) { - struct phy_device *phydev = to_phy_device(dev); - const struct phy_driver *phydrv = to_phy_driver(drv); - const int num_ids = ARRAY_SIZE(phydev->c45_ids.device_ids); - int i; - - if (!(phydrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY)) - return 0; - - if (phydrv->match_phy_device) - return phydrv->match_phy_device(phydev, phydrv); - if (phydev->is_c45) { + const int num_ids = ARRAY_SIZE(phydev->c45_ids.device_ids); + int i; + for (i = 1; i < num_ids; i++) { if (phydev->c45_ids.device_ids[i] == 0xffffffff) continue; @@ -565,11 +571,27 @@ static int phy_bus_match(struct device *dev, const struct device_driver *drv) phydrv->phy_id, phydrv->phy_id_mask)) return 1; } + return 0; - } else { - return phy_id_compare(phydev->phy_id, phydrv->phy_id, - phydrv->phy_id_mask); } + + return phy_id_compare(phydev->phy_id, phydrv->phy_id, + phydrv->phy_id_mask); +} +EXPORT_SYMBOL_GPL(genphy_match_phy_device); + +static int phy_bus_match(struct device *dev, const struct device_driver *drv) +{ + struct phy_device *phydev = to_phy_device(dev); + const struct phy_driver *phydrv = to_phy_driver(drv); + + if (!(phydrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY)) + return 0; + + if (phydrv->match_phy_device) + return phydrv->match_phy_device(phydev, phydrv); + + return genphy_match_phy_device(phydev, phydrv); } static ssize_t diff --git a/include/linux/phy.h b/include/linux/phy.h index 10e66d45a8e8..32b9da274115 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1868,6 +1868,9 @@ char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); +int genphy_match_phy_device(struct phy_device *phydev, + const struct phy_driver *phydrv); + /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); -- cgit v1.2.3 From 4ff4d86f6cceb6bea583bdb230e5439655778cce Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 19 May 2025 10:45:05 +0200 Subject: net: Add support for providing the PTP hardware source in tsinfo Multi-PTP source support within a network topology has been merged, but the hardware timestamp source is not yet exposed to users. Currently, users only see the PTP index, which does not indicate whether the timestamp comes from a PHY or a MAC. Add support for reporting the hwtstamp source using a hwtstamp-source field, alongside hwtstamp-phyindex, to describe the origin of the hardware timestamp. Remove HWTSTAMP_SOURCE_UNSPEC enum value as it is not used at all. Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250519-feature_ptp_source-v4-1-5d10e19a0265@bootlin.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++++++++++++++++++++++++ include/linux/ethtool.h | 5 +++++ include/linux/net_tstamp.h | 7 +------ include/uapi/linux/ethtool_netlink_generated.h | 14 +++++++++++++ net/ethtool/common.c | 29 +++++++++++++++++++++----- net/ethtool/tsinfo.c | 23 ++++++++++++++++++++ 6 files changed, 94 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index c650cd3dcb80..9f98715a6512 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -98,6 +98,24 @@ definitions: name: tcp-data-split type: enum entries: [ unknown, disabled, enabled ] + - + name: hwtstamp-source + doc: Source of the hardware timestamp + enum-name: hwtstamp-source + name-prefix: hwtstamp-source- + type: enum + entries: + - + name: netdev + doc: | + Hardware timestamp comes from a MAC or a device + which has MAC and PHY integrated + value: 1 + - + name: phylib + doc: | + Hardware timestamp comes from one PHY device + of the network topology attribute-sets: - @@ -896,6 +914,13 @@ attribute-sets: name: hwtstamp-provider type: nest nested-attributes: ts-hwtstamp-provider + - + name: hwtstamp-source + type: u32 + enum: hwtstamp-source + - + name: hwtstamp-phyindex + type: u32 - name: cable-result attr-cnt-name: __ethtool-a-cable-result-cnt @@ -1981,6 +2006,8 @@ operations: - phc-index - stats - hwtstamp-provider + - hwtstamp-source + - hwtstamp-phyindex dump: *tsinfo-get-op - name: cable-test-act diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 117718c24814..5e0dd333ad1f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 @@ -830,6 +831,8 @@ struct ethtool_rxfh_param { * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none * @phc_qualifier: qualifier of the associated PHC + * @phc_source: source device of the associated PHC + * @phc_phyindex: index of PHY device source of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -838,6 +841,8 @@ struct kernel_ethtool_ts_info { u32 so_timestamping; int phc_index; enum hwtstamp_provider_qualifier phc_qualifier; + enum hwtstamp_source phc_source; + int phc_phyindex; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index ff0758e88ea1..f4936d9c2b3c 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -4,6 +4,7 @@ #define _LINUX_NET_TIMESTAMPING_H_ #include +#include #define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ SOF_TIMESTAMPING_TX_SOFTWARE | \ @@ -13,12 +14,6 @@ SOF_TIMESTAMPING_TX_HARDWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) -enum hwtstamp_source { - HWTSTAMP_SOURCE_UNSPEC, - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct hwtstamp_provider_desc - hwtstamp provider description * diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 30c8dad6214e..9a02f579de22 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -37,6 +37,18 @@ enum ethtool_tcp_data_split { ETHTOOL_TCP_DATA_SPLIT_ENABLED, }; +/** + * enum hwtstamp_source - Source of the hardware timestamp + * @HWTSTAMP_SOURCE_NETDEV: Hardware timestamp comes from a MAC or a device + * which has MAC and PHY integrated + * @HWTSTAMP_SOURCE_PHYLIB: Hardware timestamp comes from one PHY device of the + * network topology + */ +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV = 1, + HWTSTAMP_SOURCE_PHYLIB, +}; + enum { ETHTOOL_A_HEADER_UNSPEC, ETHTOOL_A_HEADER_DEV_INDEX, @@ -401,6 +413,8 @@ enum { ETHTOOL_A_TSINFO_PHC_INDEX, ETHTOOL_A_TSINFO_STATS, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER, + ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, + ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 49bea6b45bd5..eb253e0fd61b 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -921,9 +921,18 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); if (IS_ERR(phy)) - err = PTR_ERR(phy); - else - err = 0; + return PTR_ERR(phy); + + /* Report the phc source only if we have a real + * phc source with an index. + */ + if (info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phy->phyindex; + } + err = 0; + } else if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_NETDEV; } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | @@ -947,10 +956,20 @@ int __ethtool_get_ts_info(struct net_device *dev, ethtool_init_tsinfo(info); if (phy_is_default_hwtstamp(phydev) && - phy_has_tsinfo(phydev)) + phy_has_tsinfo(phydev)) { err = phy_ts_info(phydev, info); - else if (ops->get_ts_info) + /* Report the phc source only if we have a real + * phc source with an index. + */ + if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phydev->phyindex; + } + } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, info); + if (!err && info->phc_index >= 0) + info->phc_source = HWTSTAMP_SOURCE_NETDEV; + } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 8130b406ef10..8c654caa6805 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -160,6 +160,12 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, /* _TSINFO_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); } + if (ts_info->phc_source) { + len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */ + if (ts_info->phc_phyindex) + /* _TSINFO_HWTSTAMP_PHYINDEX */ + len += nla_total_size(sizeof(u32)); + } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; @@ -259,6 +265,16 @@ static int tsinfo_fill_reply(struct sk_buff *skb, nla_nest_end(skb, nest); } + if (ts_info->phc_source) { + if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, + ts_info->phc_source)) + return -EMSGSIZE; + + if (ts_info->phc_phyindex && + nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, + ts_info->phc_phyindex)) + return -EMSGSIZE; + } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; @@ -346,6 +362,11 @@ static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, if (ret < 0) goto err; + if (reply_data->ts_info.phc_index >= 0) { + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB; + reply_data->ts_info.phc_phyindex = phydev->phyindex; + } + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; @@ -389,6 +410,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, if (ret < 0) goto err; + if (reply_data->ts_info.phc_index >= 0) + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV; ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) -- cgit v1.2.3 From 38b95d588f8fd07027ad8dbca3e1d2b5c13413ae Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:54 -0700 Subject: scm: Move scm_recv() from scm.h to scm.c. scm_recv() has been placed in scm.h since the pre-git era for no particular reason (I think), which makes the file really fragile. For example, when you move SOCK_PASSCRED from include/linux/net.h to enum sock_flags in include/net/sock.h, you will see weird build failure due to terrible dependency. To avoid the build failure in the future, let's move scm_recv(_unix())? and its callees to scm.c. Note that only scm_recv() needs to be exported for Bluetooth. scm_send() should be moved to scm.c too, but I'll revisit later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/scm.h | 121 ++--------------------------------------------------- net/core/scm.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 117 deletions(-) (limited to 'include') diff --git a/include/net/scm.h b/include/net/scm.h index 22bb49589fde..84c4707e78a5 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - struct lsm_context ctx; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &ctx); - - if (err >= 0) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, - ctx.context); - security_release_secctx(&ctx); - } - } -} - -static inline bool scm_has_secdata(struct socket *sock) -{ - return test_bit(SOCK_PASSSEC, &sock->flags); -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } - -static inline bool scm_has_secdata(struct socket *sock) -{ - return false; -} -#endif /* CONFIG_SECURITY_NETWORK */ - -static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) -{ - struct file *pidfd_file = NULL; - int len, pidfd; - - /* put_cmsg() doesn't return an error if CMSG is truncated, - * that's why we need to opencode these checks here. - */ - if (msg->msg_flags & MSG_CMSG_COMPAT) - len = sizeof(struct compat_cmsghdr) + sizeof(int); - else - len = sizeof(struct cmsghdr) + sizeof(int); - - if (msg->msg_controllen < len) { - msg->msg_flags |= MSG_CTRUNC; - return; - } - - if (!scm->pid) - return; - - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); - - if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { - if (pidfd_file) { - put_unused_fd(pidfd); - fput(pidfd_file); - } - - return; - } - - if (pidfd_file) - fd_install(pidfd, pidfd_file); -} - -static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return false; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_passec(sock, msg, scm); - - if (scm->fp) - scm_detach_fds(msg, scm); - - return true; -} - -static inline void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - scm_destroy_cred(scm); -} - -static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); - - scm_destroy_cred(scm); -} +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); static inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) diff --git a/net/core/scm.c b/net/core/scm.c index 733c0cbd393d..66e02b18c359 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -404,3 +404,126 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (test_bit(SOCK_PASSSEC, &sock->flags)) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct socket *sock) +{ + return test_bit(SOCK_PASSSEC, &sock->flags); +} +#else +static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct socket *sock) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. + */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags) || + scm->fp || scm_has_secdata(sock)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (test_bit(SOCK_PASSCRED, &sock->flags)) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sock, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock, msg, scm, flags)) + return; + + if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} -- cgit v1.2.3 From 7d8d93fdde50b86bbbf46a203c368ed320e729ab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:56 -0700 Subject: net: Restrict SO_PASS{CRED,PIDFD,SEC} to AF_{UNIX,NETLINK,BLUETOOTH}. SCM_CREDENTIALS and SCM_SECURITY can be recv()ed by calling scm_recv() or scm_recv_unix(), and SCM_PIDFD is only used by scm_recv_unix(). scm_recv() is called from AF_NETLINK and AF_BLUETOOTH. scm_recv_unix() is literally called from AF_UNIX. Let's restrict SO_PASSCRED and SO_PASSSEC to such sockets and SO_PASSPIDFD to AF_UNIX only. Later, SOCK_PASS{CRED,PIDFD,SEC} will be moved to struct sock and united with another field. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 14 +++++++++++++- net/core/sock.c | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 35ca6b13c6d2..483522377955 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2773,9 +2773,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2783,6 +2788,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from diff --git a/net/core/sock.c b/net/core/sock.c index d7d6d3a8efe5..fd5f9d3873c1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1221,12 +1221,21 @@ int sk_setsockopt(struct sock *sk, int level, int optname, } return -EPERM; case SO_PASSSEC: + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); return 0; case SO_PASSCRED: + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); return 0; case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); return 0; case SO_TYPE: @@ -1855,10 +1864,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); break; case SO_PASSPIDFD: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); break; @@ -1956,6 +1971,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); break; -- cgit v1.2.3 From 0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:57 -0700 Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. As explained in the next patch, SO_PASSRIGHTS would have a problem if we assigned a corresponding bit to socket->flags, so it must be managed in struct sock. Mixing socket->flags and sk->sk_flags for similar options will look confusing, and sk->sk_flags does not have enough space on 32bit system. Also, as mentioned in commit 16e572626961 ("af_unix: dont send SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPID handling is known to be slow, and managing the flags in struct socket cannot avoid that for embryo sockets. Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. While at it, other SOCK_XXX flags in net.h are grouped as enum. Note that assign_bit() was atomic, so the writer side is moved down after lock_sock() in setsockopt(), but the bit is only read once in sendmsg() and recvmsg(), so lock_sock() is not needed there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/net.h | 15 +++++++-------- include/net/sock.h | 16 +++++++++++++++- net/core/scm.c | 29 ++++++++++++++--------------- net/core/sock.c | 44 +++++++++++++++++++++++--------------------- net/unix/af_unix.c | 18 ++---------------- 5 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 0ff950eecc6b..f8418d6e33e0 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -36,14 +36,13 @@ struct net; * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. */ -#define SOCKWQ_ASYNC_NOSPACE 0 -#define SOCKWQ_ASYNC_WAITDATA 1 -#define SOCK_NOSPACE 2 -#define SOCK_PASSCRED 3 -#define SOCK_PASSSEC 4 -#define SOCK_SUPPORT_ZC 5 -#define SOCK_CUSTOM_SOCKOPT 6 -#define SOCK_PASSPIDFD 7 +enum socket_flags { + SOCKWQ_ASYNC_NOSPACE, + SOCKWQ_ASYNC_WAITDATA, + SOCK_NOSPACE, + SOCK_SUPPORT_ZC, + SOCK_CUSTOM_SOCKOPT, +}; #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/include/net/sock.h b/include/net/sock.h index 483522377955..d90a71f66ab8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -337,6 +337,11 @@ struct sk_filter; * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. * @sk_owner: reference to the real owner of the socket that calls @@ -523,7 +528,16 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_unused : 5; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, diff --git a/net/core/scm.c b/net/core/scm.c index 66e02b18c359..0225bd94170f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -406,12 +406,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) EXPORT_SYMBOL(scm_fp_dup); #ifdef CONFIG_SECURITY_NETWORK -static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) { struct lsm_context ctx; int err; - if (test_bit(SOCK_PASSSEC, &sock->flags)) { + if (sk->sk_scm_security) { err = security_secid_to_secctx(scm->secid, &ctx); if (err >= 0) { @@ -423,16 +423,16 @@ static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cooki } } -static bool scm_has_secdata(struct socket *sock) +static bool scm_has_secdata(struct sock *sk) { - return test_bit(SOCK_PASSSEC, &sock->flags); + return sk->sk_scm_security; } #else -static void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) { } -static bool scm_has_secdata(struct socket *sock) +static bool scm_has_secdata(struct sock *sk) { return false; } @@ -474,20 +474,19 @@ static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) fd_install(pidfd, pidfd_file); } -static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return false; } - if (test_bit(SOCK_PASSCRED, &sock->flags)) { + if (sk->sk_scm_credentials) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, @@ -498,7 +497,7 @@ static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } - scm_passec(sock, msg, scm); + scm_passec(sk, msg, scm); if (scm->fp) scm_detach_fds(msg, scm); @@ -509,7 +508,7 @@ static bool __scm_recv_common(struct socket *sock, struct msghdr *msg, void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { - if (!__scm_recv_common(sock, msg, scm, flags)) + if (!__scm_recv_common(sock->sk, msg, scm, flags)) return; scm_destroy_cred(scm); @@ -519,10 +518,10 @@ EXPORT_SYMBOL(scm_recv); void scm_recv_unix(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { - if (!__scm_recv_common(sock, msg, scm, flags)) + if (!__scm_recv_common(sock->sk, msg, scm, flags)) return; - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) + if (sock->sk->sk_scm_pidfd) scm_pidfd_recv(msg, scm); scm_destroy_cred(scm); diff --git a/net/core/sock.c b/net/core/sock.c index fd5f9d3873c1..381abf8f25b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1220,24 +1220,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } return -EPERM; - case SO_PASSSEC: - if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || sk_may_scm_recv(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - return 0; - case SO_PASSCRED: - if (!sk_may_scm_recv(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - return 0; - case SO_PASSPIDFD: - if (!sk_is_unix(sk)) - return -EOPNOTSUPP; - - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -1568,6 +1550,26 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); @@ -1867,14 +1869,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!sk_may_scm_recv(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: if (!sk_is_unix(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + v.val = sk->sk_scm_pidfd; break; case SO_PEERCRED: @@ -1974,7 +1976,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) return -EOPNOTSUPP; - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + v.val = sk->sk_scm_security; break; case SO_PEERSEC: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a39497fd6e98..27ebda4cd9b9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -767,10 +767,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) static bool unix_may_passcred(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags); + return sk->sk_scm_credentials || sk->sk_scm_pidfd; } static int unix_listen(struct socket *sock, int backlog) @@ -1713,17 +1710,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } -static void unix_sock_inherit_flags(const struct socket *old, - struct socket *new) -{ - if (test_bit(SOCK_PASSCRED, &old->flags)) - set_bit(SOCK_PASSCRED, &new->flags); - if (test_bit(SOCK_PASSPIDFD, &old->flags)) - set_bit(SOCK_PASSPIDFD, &new->flags); - if (test_bit(SOCK_PASSSEC, &old->flags)) - set_bit(SOCK_PASSSEC, &new->flags); -} - static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -1760,7 +1746,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - unix_sock_inherit_flags(sock, newsock); + tsk->sk_scm_recv_flags = READ_ONCE(sk->sk_scm_recv_flags); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; -- cgit v1.2.3 From 77cbe1a6d8730a07f99f9263c2d5f2304cf5e830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:59 -0700 Subject: af_unix: Introduce SO_PASSRIGHTS. As long as recvmsg() or recvmmsg() is used with cmsg, it is not possible to avoid receiving file descriptors via SCM_RIGHTS. This behaviour has occasionally been flagged as problematic, as it can be (ab)used to trigger DoS during close(), for example, by passing a FUSE-controlled fd or a hung NFS fd. For instance, as noted on the uAPI Group page [0], an untrusted peer could send a file descriptor pointing to a hung NFS mount and then close it. Once the receiver calls recvmsg() with msg_control, the descriptor is automatically installed, and then the responsibility for the final close() now falls on the receiver, which may result in blocking the process for a long time. Regarding this, systemd calls cmsg_close_all() [1] after each recvmsg() to close() unwanted file descriptors sent via SCM_RIGHTS. However, this cannot work around the issue at all, because the final fput() may still occur on the receiver's side once sendmsg() with SCM_RIGHTS succeeds. Also, even filtering by LSM at recvmsg() does not work for the same reason. Thus, we need a better way to refuse SCM_RIGHTS at sendmsg(). Let's introduce SO_PASSRIGHTS to disable SCM_RIGHTS. Note that this option is enabled by default for backward compatibility. Link: https://uapi-group.org/kernel-features/#disabling-reception-of-scm_rights-for-af_unix-sockets #[0] Link: https://github.com/systemd/systemd/blob/v257.5/src/basic/fd-util.c#L612-L628 #[1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 4 +++- include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 14 ++++++++++++++ net/unix/af_unix.c | 22 ++++++++++++++++++++-- tools/include/uapi/asm-generic/socket.h | 2 ++ 9 files changed, 49 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 3df5f2dd4c0f..8f1f18adcdb5 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -150,6 +150,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 22fa8f19924a..31ac655b7837 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -161,6 +161,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 96831c988606..1f2d5b7a7f5d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -142,6 +142,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x4050 +#define SO_PASSRIGHTS 0x4051 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 5b464a568664..adcba7329386 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -143,6 +143,8 @@ #define SO_RCVPRIORITY 0x005b +#define SO_PASSRIGHTS 0x005c + #if !defined(__KERNEL__) diff --git a/include/net/sock.h b/include/net/sock.h index d90a71f66ab8..92e7c1aae3cc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -341,6 +341,7 @@ struct sk_filter; * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. @@ -535,7 +536,8 @@ struct sock { u8 sk_scm_credentials : 1, sk_scm_security : 1, sk_scm_pidfd : 1, - sk_scm_unused : 5; + sk_scm_rights : 1, + sk_scm_unused : 4; }; }; u8 sk_clockid; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index aa5016ff3d91..f333a0ac4ee4 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -145,6 +145,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 381abf8f25b7..0cb52e590094 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1571,6 +1571,13 @@ set_sndbuf: ret = -EOPNOTSUPP; break; + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; + case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); break; @@ -1879,6 +1886,13 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sk->sk_scm_pidfd; break; + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; + break; + case SO_PEERCRED: { struct ucred peercred; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 900bad88fbd2..bd507f74e35e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1015,6 +1015,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sock_init_data(sock, sk); + sk->sk_scm_rights = 1; sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -2073,6 +2074,11 @@ restart_locked: goto out_unlock; } + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; + } + if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) @@ -2174,9 +2180,13 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { - unix_state_unlock(other); err = -EPIPE; - goto out; + goto out_unlock; + } + + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; } unix_maybe_add_creds(skb, sk, other); @@ -2192,6 +2202,8 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, other->sk_data_ready(other); return 0; +out_unlock: + unix_state_unlock(other); out: consume_skb(skb); return err; @@ -2295,6 +2307,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + unix_state_unlock(other); + err = -EPERM; + goto out_free; + } + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); diff --git a/tools/include/uapi/asm-generic/socket.h b/tools/include/uapi/asm-generic/socket.h index aa5016ff3d91..f333a0ac4ee4 100644 --- a/tools/include/uapi/asm-generic/socket.h +++ b/tools/include/uapi/asm-generic/socket.h @@ -145,6 +145,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From 9a119669fb1924cd9658c16da39a5a585e129e50 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 May 2025 11:38:48 +0200 Subject: netfilter: nf_tables: nft_fib: consistent l3mdev handling fib has two modes: 1. Obtain output device according to source or destination address 2. Obtain the type of the address, e.g. local, unicast, multicast. 'fib daddr type' should return 'local' if the address is configured in this netns or unicast otherwise. 'fib daddr . iif type' should return 'local' if the address is configured on the input interface or unicast otherwise, i.e. more restrictive. However, if the interface is part of a VRF, then 'fib daddr type' returns unicast even if the address is configured on the incoming interface. This is broken for both ipv4 and ipv6. In the ipv4 case, inet_dev_addr_type must only be used if the 'iif' or 'oif' (strict mode) was requested. Else inet_addr_type_dev_table() needs to be used and the correct dev argument must be passed as well so the correct fib (vrf) table is used. In the ipv6 case, the bug is similar, without strict mode, dev is NULL so .flowi6_l3mdev will be set to 0. Add a new 'nft_fib_l3mdev_master_ifindex_rcu()' helper and use that to init the .l3mdev structure member. For ipv6, use it from nft_fib6_flowi_init() which gets called from both the 'type' and the 'route' mode eval functions. This provides consistent behaviour for all modes for both ipv4 and ipv6: If strict matching is requested, the input respectively output device of the netfilter hooks is used. Otherwise, use skb->dev to obtain the l3mdev ifindex. Without this, most type checks in updated nft_fib.sh selftest fail: FAIL: did not find veth0 . 10.9.9.1 . local in fibtype4 FAIL: did not find veth0 . dead:1::1 . local in fibtype6 FAIL: did not find veth0 . dead:9::1 . local in fibtype6 FAIL: did not find tvrf . 10.0.1.1 . local in fibtype4 FAIL: did not find tvrf . 10.9.9.1 . local in fibtype4 FAIL: did not find tvrf . dead:1::1 . local in fibtype6 FAIL: did not find tvrf . dead:9::1 . local in fibtype6 FAIL: fib expression address types match (iif in vrf) (fib errounously returns 'unicast' for all of them, even though all of these addresses are local to the vrf). Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 9 +++++++++ net/ipv4/netfilter/nft_fib_ipv4.c | 11 +++++++++-- net/ipv6/netfilter/nft_fib_ipv6.c | 4 +--- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 6e202ed5e63f..7370fba844ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include #include struct nft_fib { @@ -39,6 +40,14 @@ static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) return nft_fib_is_loopback(pkt->skb, indev); } +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9082ca17e845..7e7c49535e3f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else addr = iph->saddr; - *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + return; + } + + *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr); } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_proto = pkt->tprot, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; @@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; + fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index f1f5640da672..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -166,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; -- cgit v1.2.3 From a1f1acb9c5db9b385c9b3eb1f27f897c06df49ae Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:44 +0200 Subject: netfilter: nf_dup{4, 6}: Move duplication check to task_struct nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Due to the recursion involved, the simplest change is to make it a per-task variable. Move the per-CPU variable nf_skb_duplicated to task_struct and name it in_nf_duplicate. Add it to the existing bitfield so it doesn't use additional memory. Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 ----------- include/linux/sched.h | 1 + net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/nf_dup_ipv4.c | 6 +++--- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 6 +++--- net/netfilter/core.c | 3 --- 7 files changed, 9 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ad..892d12823ed4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -497,17 +497,6 @@ struct nf_defrag_hook { extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; -/* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..52d9c52dc8f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif + unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27f..23c8deff8095 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd..ed08fb78cfa8 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, struct iphdr *iph; local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->ttl; if (nf_dup_ipv4_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae7..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8..b903c62c00c9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c81..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); -- cgit v1.2.3 From f37ad91270397a6d053e8623bdb3cf79859691d2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 12 May 2025 12:28:46 +0200 Subject: netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice_xmit.h | 3 +++ net/netfilter/nf_dup_netdev.c | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 848735b3a7c0..813a19122ebb 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit { #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) u8 sched_mirred_nest; #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) + u8 nf_dup_skb_recursion; +#endif }; #endif diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return ¤t->net_xmit.nf_dup_skb_recursion; +} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); -- cgit v1.2.3 From 90869f43d06dfc836def2f53850a878f829e443e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 May 2025 15:49:33 +0200 Subject: netfilter: conntrack: make nf_conntrack_id callable without a module dependency While nf_conntrack_id() doesn't need any functionaliy from conntrack, it does reside in nf_conntrack_core.c -- callers add a module dependency on conntrack. Followup patch will need to compute the conntrack id from nf_tables_trace.c to include it in nf_trace messages emitted to userspace via netlink. I don't want to introduce a module dependency between nf_tables and conntrack for this. Since trace is slowpath, the added indirection is ok. One alternative is to move nf_conntrack_id to the netfilter/core.c, but I don't see a compelling reason so far. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + net/netfilter/nf_conntrack_core.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 892d12823ed4..20947f2c685b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -470,6 +470,7 @@ struct nf_ct_hook { void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); + u32 (*get_id)(const struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index de8d50af9b5b..201d3c4ec623 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { @@ -2710,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) -- cgit v1.2.3 From 7e5c6aa67e6f6133c5a2c53852e1dd9af2c0c3fc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 22 May 2025 15:49:34 +0200 Subject: netfilter: nf_tables: add packets conntrack state to debug trace info Add the minimal relevant info needed for userspace ("nftables monitor trace") to provide the conntrack view of the packet: - state (new, related, established) - direction (original, reply) - status (e.g., if connection is subject to dnat) - id (allows to query ctnetlink for remaining conntrack state info) Example: trace id a62 inet filter PRE_RAW packet: iif "enp0s3" ether [..] [..] trace id a62 inet filter PRE_MANGLE conntrack: ct direction original ct state new ct id 32 trace id a62 inet filter PRE_MANGLE packet: [..] [..] trace id a62 inet filter IN conntrack: ct direction original ct state new ct status dnat-done ct id 32 [..] In this case one can see that while NAT is active, the new connection isn't subject to a translation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 8 +++++ net/netfilter/nf_tables_trace.c | 54 +++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7d6bc19a0153..2beb30be2c5f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1841,6 +1841,10 @@ enum nft_xfrm_keys { * @NFTA_TRACE_MARK: nfmark (NLA_U32) * @NFTA_TRACE_NFPROTO: nf protocol processed (NLA_U32) * @NFTA_TRACE_POLICY: policy that decided fate of packet (NLA_U32) + * @NFTA_TRACE_CT_ID: conntrack id (NLA_U32) + * @NFTA_TRACE_CT_DIRECTION: packets direction (NLA_U8) + * @NFTA_TRACE_CT_STATUS: conntrack status (NLA_U32) + * @NFTA_TRACE_CT_STATE: packet state (new, established, ...) (NLA_U32) */ enum nft_trace_attributes { NFTA_TRACE_UNSPEC, @@ -1861,6 +1865,10 @@ enum nft_trace_attributes { NFTA_TRACE_NFPROTO, NFTA_TRACE_POLICY, NFTA_TRACE_PAD, + NFTA_TRACE_CT_ID, + NFTA_TRACE_CT_DIRECTION, + NFTA_TRACE_CT_STATUS, + NFTA_TRACE_CT_STATE, __NFTA_TRACE_MAX }; #define NFTA_TRACE_MAX (__NFTA_TRACE_MAX - 1) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 580c55268f65..ae3fe87195ab 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } -- cgit v1.2.3 From e225376d78fb2d85e99a2436a9e65765dc1ac234 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:23 +0200 Subject: netfilter: nf_tables: Introduce nft_hook_find_ops{,_rcu}() Also a pretty dull wrapper around the hook->ops.dev comparison for now. Will search the embedded nf_hook_ops list in future. The ugly cast to eliminate the const qualifier will vanish then, too. Since this future list will be RCU-protected, also introduce an _rcu() variant here. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_tables_api.c | 21 ++++++++++++++++++++- net/netfilter/nf_tables_offload.c | 2 +- net/netfilter/nft_chain_filter.c | 6 ++++-- net/netfilter/nft_flow_offload.c | 2 +- 5 files changed, 31 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..df0b151743a2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1205,6 +1205,11 @@ struct nft_hook { u8 ifnamelen; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9998fcf44a38..c5b7922ca5bf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9600,13 +9600,32 @@ nla_put_failure: return -EMSGSIZE; } +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + if (hook->ops.dev == dev) + return (struct nf_hook_ops *)&hook->ops; + + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) +{ + return nft_hook_find_ops(hook, dev); +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + static void nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable) { + struct nf_hook_ops *ops; struct nft_hook *hook; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) + ops = nft_hook_find_ops(hook, dev); + if (!ops) continue; /* flow_offload_netdev_event() cleans up entries for us. */ diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64675f1c7f29..75b756f0b9f0 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -638,7 +638,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 19a553550c76..783e4b5ef3e0 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -321,14 +321,16 @@ static const struct nft_chain_type nft_chain_filter_netdev = { static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct nft_hook *hook; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + ops = nft_hook_find_ops(hook, dev); + if (!ops) continue; if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) - nf_unregister_net_hook(dev_net(dev), &hook->ops); + nf_unregister_net_hook(dev_net(dev), ops); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 221d50223018..225ff293cd50 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; -- cgit v1.2.3 From 73319a8ee18b9cf0b2dac87f8521595e0381ba0c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:26 +0200 Subject: netfilter: nf_tables: Have a list of nf_hook_ops in nft_hook Supporting a 1:n relationship between nft_hook and nf_hook_ops is convenient since a chain's or flowtable's nft_hooks may remain in place despite matching interfaces disappearing. This stabilizes ruleset dumps in that regard and opens the possibility to claim newly added interfaces which match the spec. Also it prepares for wildcard interface specs since these will potentially match multiple interfaces. All spots dealing with hook registration are updated to handle a list of multiple nf_hook_ops, but nft_netdev_hook_alloc() only adds a single item for now to retain the old behaviour. The only expected functional change here is how vanishing interfaces are handled: Instead of dropping the respective nft_hook, only the matching nf_hook_ops are dropped. To safely remove individual ops from the list in netdev handlers, an rcu_head is added to struct nf_hook_ops so kfree_rcu() may be used. There is at least nft_flowtable_find_dev() which may be iterating through the list at the same time. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 3 + include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 142 +++++++++++++++++++++++++++----------- net/netfilter/nf_tables_offload.c | 49 +++++++------ net/netfilter/nft_chain_filter.c | 4 +- 5 files changed, 137 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 20947f2c685b..5f896fcc074d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -95,6 +95,9 @@ enum nf_hook_ops_type { }; struct nf_hook_ops { + struct list_head list; + struct rcu_head rcu; + /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index df0b151743a2..5e49619ae49c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1199,7 +1199,7 @@ struct nft_stats { struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8fb8bcdfdcb2..62bf498d1ec9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -300,37 +300,60 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct nf_hook_ops *ops, *next; + + list_for_each_entry_safe(ops, next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } +} + static void nft_netdev_hook_free(struct nft_hook *hook) { + nft_netdev_hook_free_ops(hook); kfree(hook); } +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); +} + static void nft_netdev_hook_free_rcu(struct nft_hook *hook) { - kfree_rcu(hook, rcu); + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); } static void nft_netdev_unregister_hooks(struct net *net, @@ -338,9 +361,11 @@ static void nft_netdev_unregister_hooks(struct net *net, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); if (release_netdev) { list_del(&hook->list); nft_netdev_hook_free_rcu(hook); @@ -2284,6 +2309,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain) static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; int err; @@ -2293,6 +2319,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, err = -ENOMEM; goto err_hook_alloc; } + INIT_LIST_HEAD(&hook->ops_list); err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); if (err < 0) @@ -2309,7 +2336,14 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, err = -ENOENT; goto err_hook_dev; } - hook->ops.dev = dev; + + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_dev; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); return hook; @@ -2569,6 +2603,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2577,8 +2612,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2797,11 +2834,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { - h->ops.pf = basechain->ops.pf; - h->ops.hooknum = basechain->ops.hooknum; - h->ops.priority = basechain->ops.priority; - h->ops.priv = basechain->ops.priv; - h->ops.hook = basechain->ops.hook; + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); @@ -2923,8 +2962,10 @@ err_trans: err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { - if (unregister) - nf_unregister_net_hook(ctx->net, &h->ops); + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } list_del(&h->list); nft_netdev_hook_free_rcu(h); } @@ -8795,6 +8836,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; @@ -8849,11 +8891,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + } } return err; @@ -8910,9 +8954,11 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nft_unregister_flowtable_ops(net, flowtable, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); if (release_netdev) { list_del(&hook->list); nft_netdev_hook_free_rcu(hook); @@ -8954,6 +9000,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, { struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; list_for_each_entry(hook, hook_list, list) { @@ -8967,21 +9014,25 @@ static int nft_register_flowtable_net_hooks(struct net *net, } } - err = nft_register_flowtable_ops(net, flowtable, &hook->ops); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - i++; + i++; + } } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_ops(net, flowtable, &hook->ops); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); nft_netdev_hook_free_rcu(hook); } @@ -9006,6 +9057,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; @@ -9063,8 +9115,11 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_ops(ctx->net, flowtable, &hook->ops); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); nft_netdev_hook_free_rcu(hook); } @@ -9611,9 +9666,12 @@ nla_put_failure: struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, const struct net_device *dev) { - if (hook->ops.dev == dev) - return (struct nf_hook_ops *)&hook->ops; + struct nf_hook_ops *ops; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } return NULL; } EXPORT_SYMBOL_GPL(nft_hook_find_ops); @@ -9621,7 +9679,13 @@ EXPORT_SYMBOL_GPL(nft_hook_find_ops); struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, const struct net_device *dev) { - return nft_hook_find_ops(hook, dev); + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; } EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); @@ -9638,8 +9702,8 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev, /* flow_offload_netdev_event() cleans up entries for us. */ nft_unregister_flowtable_ops(dev_net(dev), flowtable, ops); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); break; } } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 75b756f0b9f0..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 783e4b5ef3e0..862eab45851a 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -332,8 +332,8 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) nf_unregister_net_hook(dev_net(dev), ops); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); break; } } -- cgit v1.2.3 From 465b9ee0ee7bc268d7f261356afd6c4262e48d82 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 21 May 2025 22:44:33 +0200 Subject: netfilter: nf_tables: Add notifications for hook changes Notify user space if netdev hooks are updated due to netdev add/remove events. Send minimal notification messages by introducing NFT_MSG_NEWDEV/DELDEV message types describing a single device only. Upon NETDEV_CHANGENAME, the callback has no information about the interface's old name. To provide a clear message to user space, include the hook's stored interface name in the notification. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++ include/uapi/linux/netfilter/nf_tables.h | 10 ++++++ include/uapi/linux/netfilter/nfnetlink.h | 2 ++ net/netfilter/nf_tables_api.c | 59 ++++++++++++++++++++++++++++++++ net/netfilter/nfnetlink.c | 1 + net/netfilter/nft_chain_filter.c | 2 ++ 6 files changed, 79 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5e49619ae49c..e4d8e451e935 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,6 +1142,11 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); +struct nft_hook; +void nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2beb30be2c5f..518ba144544c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,6 +142,8 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, + NFT_MSG_NEWDEV, + NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1784,10 +1786,18 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) + * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) + * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_TABLE, + NFTA_DEVICE_FLOWTABLE, + NFTA_DEVICE_CHAIN, + NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 6cd58cd2a6f0..50d807af2649 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,6 +25,8 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE + NFNLGRP_NFT_DEV, +#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a7240736f98e..24c71ecb2179 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9686,6 +9686,64 @@ struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, } EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); +static void +nf_tables_device_notify(const struct nft_table *table, int attr, + const char *name, const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + struct net *net = dev_net(dev); + struct nlmsghdr *nlh; + struct sk_buff *skb; + u16 flags = 0; + + if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + goto err; + + event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) + goto err; + + if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || + nla_put_string(skb, attr, name) || + nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + goto err; + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, + nlmsg_report(nlh), GFP_KERNEL); + return; +err: + if (skb) + kfree_skb(skb); + nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); +} + +void +nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, + chain->name, hook, dev, event); +} + +static void +nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, + ft->name, hook, dev, event); +} + static int nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable, bool changename) { @@ -9733,6 +9791,7 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } + nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } return 0; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..ac77fc21632d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index b16185e9a6dd..846d48ba8965 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -363,6 +363,8 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } + nf_tables_chain_device_notify(&basechain->chain, + hook, dev, event); break; } return 0; -- cgit v1.2.3 From 384492c48e6a88c9a7f0376d8e8ac7f557988e92 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 20 May 2025 13:30:42 -0700 Subject: net: devmem: support single IOV with sendmsg sendmsg() with a single iov becomes ITER_UBUF, sendmsg() with multiple iovs becomes ITER_IOVEC. iter_iov_len does not return correct value for UBUF, so teach to treat UBUF differently. Cc: Al Viro Cc: Pavel Begunkov Cc: Mina Almasry Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Stanislav Fomichev Acked-by: Mina Almasry Reviewed-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/uio.h | 8 +++++++- net/core/datagram.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 49ece9e1888f..393d0622cc28 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -99,7 +99,13 @@ static inline const struct iovec *iter_iov(const struct iov_iter *iter) } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) -#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + +static inline size_t iter_iov_len(const struct iov_iter *i) +{ + if (i->iter_type == ITER_UBUF) + return i->count; + return iter_iov(i)->iov_len - i->iov_offset; +} static inline enum iter_type iov_iter_type(const struct iov_iter *i) { diff --git a/net/core/datagram.c b/net/core/datagram.c index b352a1009304..94cc4705e91d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -702,7 +702,8 @@ zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, * iov_addrs are interpreted as an offset in bytes into the dma-buf to * send from. We do not support other iter types. */ - if (iov_iter_type(from) != ITER_IOVEC) + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) return -EFAULT; while (length && iov_iter_count(from)) { -- cgit v1.2.3 From e45b7196df60a4aef86c3998611c91fcc93d21f3 Mon Sep 17 00:00:00 2001 From: Qiu Yutan Date: Wed, 21 May 2025 10:14:08 +0800 Subject: net: neigh: use kfree_skb_reason() in neigh_resolve_output() and neigh_connected_output() Replace kfree_skb() used in neigh_resolve_output() and neigh_connected_output() with kfree_skb_reason(). Following new skb drop reason is added: /* failed to fill the device hard header */ SKB_DROP_REASON_NEIGH_HH_FILLFAIL Signed-off-by: Qiu Yutan Signed-off-by: Jiang Kun Reviewed-by: Kuniyuki Iwashima Reviewed-by: Xu Xin Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 3 +++ net/core/neighbour.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index bea77934a235..bcf9d7467e1a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -62,6 +62,7 @@ FN(NEIGH_FAILED) \ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ @@ -348,6 +349,8 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_QUEUEFULL, /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 254067b719da..a6e2c91ec3e7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1517,7 +1517,7 @@ out: return rc; out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1541,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } -- cgit v1.2.3 From 45ca7e9f0730ae36fc610e675b990e9cc9ca0714 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 21 May 2025 14:17:05 +0200 Subject: vsock/virtio: fix `rx_bytes` accounting for stream sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `struct virtio_vsock_sock`, we maintain two counters: - `rx_bytes`: used internally to track how many bytes have been read. This supports mechanisms like .stream_has_data() and sock_rcvlowat(). - `fwd_cnt`: used for the credit mechanism to inform available receive buffer space to the remote peer. These counters are updated via virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt(). Since the beginning with commit 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko"), we call virtio_transport_dec_rx_pkt() in virtio_transport_stream_do_dequeue() only when we consume the entire packet, so partial reads, do not update `rx_bytes` and `fwd_cnt`. This is fine for `fwd_cnt`, because we still have space used for the entire packet, and we don't want to update the credit for the other peer until we free the space of the entire packet. However, this causes `rx_bytes` to be stale on partial reads. Previously, this didn’t cause issues because `rx_bytes` was used only by .stream_has_data(), and any unread portion of a packet implied data was still available. However, since commit 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages"), we now rely on `rx_bytes` to determine if a credit update should be sent when the data in the RX queue drops below SO_RCVLOWAT value. This patch fixes the accounting by updating `rx_bytes` with the number of bytes actually read, even on partial reads, while leaving `fwd_cnt` untouched until the packet is fully consumed. Also introduce a new `buf_used` counter to check that the remote peer is honoring the given credit; this was previously done via `rx_bytes`. Fixes: 93b808876682 ("virtio/vsock: fix logic which reduces credit update messages") Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250521121705.196379-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni --- include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport_common.c | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 0387d64e2c66..36fb3edfa403 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -140,6 +140,7 @@ struct virtio_vsock_sock { u32 last_fwd_cnt; u32 rx_bytes; u32 buf_alloc; + u32 buf_used; struct sk_buff_head rx_queue; u32 msg_count; }; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d88096..2c9b1011cdcc 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -441,18 +441,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->rx_bytes + len > vvs->buf_alloc) + if (vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; + vvs->buf_used += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - u32 len) + u32 bytes_read, u32 bytes_dequeued) { - vvs->rx_bytes -= len; - vvs->fwd_cnt += len; + vvs->rx_bytes -= bytes_read; + vvs->buf_used -= bytes_dequeued; + vvs->fwd_cnt += bytes_dequeued; } void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) @@ -581,11 +583,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; bool low_rx_bytes; int err = -EFAULT; + size_t total = 0; u32 free_space; spin_lock_bh(&vvs->rx_lock); @@ -597,6 +599,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + size_t bytes, dequeued = 0; + skb = skb_peek(&vvs->rx_queue); bytes = min_t(size_t, len - total, @@ -620,12 +624,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { - u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); - - virtio_transport_dec_rx_pkt(vvs, pkt_len); + dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len); __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); } + + virtio_transport_dec_rx_pkt(vvs, bytes, dequeued); } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; @@ -781,7 +785,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt_len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); kfree_skb(skb); } @@ -1735,6 +1739,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto struct sock *sk = sk_vsock(vsk); struct virtio_vsock_hdr *hdr; struct sk_buff *skb; + u32 pkt_len; int off = 0; int err; @@ -1752,7 +1757,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count--; - virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); + pkt_len = le32_to_cpu(hdr->len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); spin_unlock_bh(&vvs->rx_lock); virtio_transport_send_credit_update(vsk); -- cgit v1.2.3 From ed449ddbd867f2cc02d6890c231431f264a876eb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:09 -0700 Subject: net: core: Convert inet_addr_is_any() to sockaddr_storage All the callers of inet_addr_is_any() have a sockaddr_storage-backed sockaddr. Avoid casts and switch prototype to the actual object being used. Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin K. Petersen # SCSI Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-1-kees@kernel.org Signed-off-by: Paolo Abeni --- drivers/nvme/target/rdma.c | 2 +- drivers/nvme/target/tcp.c | 2 +- drivers/target/iscsi/iscsi_target.c | 2 +- include/linux/inet.h | 2 +- net/core/utils.c | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2a4536ef6184..79a5aad2e9d0 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1999,7 +1999,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, struct nvmet_rdma_port *port = nport->priv; struct rdma_cm_id *cm_id = port->cm_id; - if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { + if (inet_addr_is_any(&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = container_of(req, struct nvmet_rdma_rsp, req); struct rdma_cm_id *req_cm_id = rsp->queue->cm_id; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 12a5cb8641ca..5cd1cf74f8ff 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2194,7 +2194,7 @@ static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, { struct nvmet_tcp_port *port = nport->priv; - if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + if (inet_addr_is_any(&port->addr)) { struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 620ba6e0ab07..a2dde08c8a62 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3419,7 +3419,7 @@ iscsit_build_sendtargets_response(struct iscsit_cmd *cmd, } } - if (inet_addr_is_any((struct sockaddr *)&np->np_sockaddr)) + if (inet_addr_is_any(&np->np_sockaddr)) sockaddr = &conn->local_sockaddr; else sockaddr = &np->np_sockaddr; diff --git a/include/linux/inet.h b/include/linux/inet.h index bd8276e96e60..9158772f3559 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -55,6 +55,6 @@ extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char extern int inet_pton_with_scope(struct net *net, unsigned short af, const char *src, const char *port, struct sockaddr_storage *addr); -extern bool inet_addr_is_any(struct sockaddr *addr); +bool inet_addr_is_any(struct sockaddr_storage *addr); #endif /* _LINUX_INET_H */ diff --git a/net/core/utils.c b/net/core/utils.c index 27f4cffaae05..e47feeaa5a49 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, } EXPORT_SYMBOL(inet_pton_with_scope); -bool inet_addr_is_any(struct sockaddr *addr) +bool inet_addr_is_any(struct sockaddr_storage *addr) { - if (addr->sa_family == AF_INET6) { + if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; @@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr) if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; - } else if (addr->sa_family == AF_INET) { + } else if (addr->ss_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { - pr_warn("unexpected address family %u\n", addr->sa_family); + pr_warn("unexpected address family %u\n", addr->ss_family); } return false; -- cgit v1.2.3 From 161972650d6795ea00f8b72557cf3c3e593ed250 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:10 -0700 Subject: net: core: Switch netif_set_mac_address() to struct sockaddr_storage In order to avoid passing around struct sockaddr that has a size the compiler cannot reason about (nor track at runtime), convert netif_set_mac_address() to take struct sockaddr_storage. This is just a cast conversion, so there is are no binary changes. Following patches will make actual allocation changes. Acked-by: Gustavo A. R. Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-2-kees@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- net/core/dev.c | 10 +++++----- net/core/dev_api.c | 4 ++-- net/core/rtnetlink.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ea9d335de130..47200a394a02 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4212,7 +4212,7 @@ int netif_set_mtu(struct net_device *dev, int new_mtu); int dev_set_mtu(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); -int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index 3eb4e945f312..6a0560546eeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9669,7 +9669,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, } EXPORT_SYMBOL(dev_pre_changeaddr_notify); -int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9677,15 +9677,15 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + err = dev_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; - if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { - err = ops->ndo_set_mac_address(dev, sa); + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); if (err) return err; } diff --git a/net/core/dev_api.c b/net/core/dev_api.c index f9a160ab596f..b5f293e637d9 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -91,7 +91,7 @@ int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, down_write(&dev_addr_sem); netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, sa, extack); + ret = netif_set_mac_address(dev, (struct sockaddr_storage *)sa, extack); netdev_unlock_ops(dev); up_write(&dev_addr_sem); @@ -332,7 +332,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int ret; netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, sa, extack); + ret = netif_set_mac_address(dev, (struct sockaddr_storage *)sa, extack); netdev_unlock_ops(dev); return ret; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8a914b37ef6e..9743f1c2ae3c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3100,7 +3100,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); - err = netif_set_mac_address(dev, sa, extack); + err = netif_set_mac_address(dev, (struct sockaddr_storage *)sa, extack); kfree(sa); if (err) { up_write(&dev_addr_sem); -- cgit v1.2.3 From 9ca6804ab7c34f65fcf2e29333a39e7807c30b60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:14 -0700 Subject: net: core: Convert dev_set_mac_address() to struct sockaddr_storage All users of dev_set_mac_address() are now using a struct sockaddr_storage. Convert the internal data type to struct sockaddr_storage, drop the casts, and update pointer types. Acked-by: Gustavo A. R. Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250521204619.2301870-6-kees@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_alb.c | 8 +++----- drivers/net/bonding/bond_main.c | 15 ++++++--------- drivers/net/hyperv/netvsc_drv.c | 6 +++--- drivers/net/macvlan.c | 18 +++++++++--------- drivers/net/team/team_core.c | 2 +- drivers/net/usb/r8152.c | 2 +- include/linux/netdevice.h | 2 +- net/core/dev.c | 1 + net/core/dev_api.c | 6 +++--- net/ieee802154/nl-phy.c | 2 +- net/ncsi/ncsi-manage.c | 2 +- 11 files changed, 30 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 7edf0fd58c34..2d37b07c8215 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1035,7 +1035,7 @@ static int alb_set_slave_mac_addr(struct slave *slave, const u8 addr[], */ memcpy(ss.__data, addr, len); ss.ss_family = dev->type; - if (dev_set_mac_address(dev, (struct sockaddr *)&ss, NULL)) { + if (dev_set_mac_address(dev, &ss, NULL)) { slave_err(slave->bond->dev, dev, "dev_set_mac_address on slave failed! ALB mode requires that the base driver support setting the hw address also when the network device's interface is open\n"); return -EOPNOTSUPP; } @@ -1273,8 +1273,7 @@ unwind: break; bond_hw_addr_copy(tmp_addr, rollback_slave->dev->dev_addr, rollback_slave->dev->addr_len); - dev_set_mac_address(rollback_slave->dev, - (struct sockaddr *)&ss, NULL); + dev_set_mac_address(rollback_slave->dev, &ss, NULL); dev_addr_set(rollback_slave->dev, tmp_addr); } @@ -1763,8 +1762,7 @@ void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave bond->dev->addr_len); ss.ss_family = bond->dev->type; /* we don't care if it can't change its mac, best effort */ - dev_set_mac_address(new_slave->dev, (struct sockaddr *)&ss, - NULL); + dev_set_mac_address(new_slave->dev, &ss, NULL); dev_addr_set(new_slave->dev, tmp_addr); } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 98cf4486fcee..c4d53e8e7c15 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1112,8 +1112,7 @@ static void bond_do_fail_over_mac(struct bonding *bond, ss.ss_family = bond->dev->type; } - rv = dev_set_mac_address(new_active->dev, - (struct sockaddr *)&ss, NULL); + rv = dev_set_mac_address(new_active->dev, &ss, NULL); if (rv) { slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n", -rv); @@ -1127,8 +1126,7 @@ static void bond_do_fail_over_mac(struct bonding *bond, new_active->dev->addr_len); ss.ss_family = old_active->dev->type; - rv = dev_set_mac_address(old_active->dev, - (struct sockaddr *)&ss, NULL); + rv = dev_set_mac_address(old_active->dev, &ss, NULL); if (rv) slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n", -rv); @@ -2127,7 +2125,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, } ss.ss_family = slave_dev->type; - res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, extack); + res = dev_set_mac_address(slave_dev, &ss, extack); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; @@ -2455,7 +2453,7 @@ err_restore_mac: bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, new_slave->dev->addr_len); ss.ss_family = slave_dev->type; - dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); + dev_set_mac_address(slave_dev, &ss, NULL); } err_restore_mtu: @@ -2649,7 +2647,7 @@ static int __bond_release_one(struct net_device *bond_dev, bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, slave->dev->addr_len); ss.ss_family = slave_dev->type; - dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); + dev_set_mac_address(slave_dev, &ss, NULL); } if (unregister) { @@ -4936,8 +4934,7 @@ unwind: if (rollback_slave == slave) break; - tmp_res = dev_set_mac_address(rollback_slave->dev, - (struct sockaddr *)&tmp_ss, NULL); + tmp_res = dev_set_mac_address(rollback_slave->dev, &tmp_ss, NULL); if (tmp_res) { slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n", __func__, tmp_res); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8b169ac0343..14a0d04e21ae 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1371,7 +1371,7 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) struct net_device_context *ndc = netdev_priv(ndev); struct net_device *vf_netdev = rtnl_dereference(ndc->vf_netdev); struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev); - struct sockaddr *addr = p; + struct sockaddr_storage *addr = p; int err; err = eth_prepare_mac_addr_change(ndev, p); @@ -1387,12 +1387,12 @@ static int netvsc_set_mac_addr(struct net_device *ndev, void *p) return err; } - err = rndis_filter_set_device_mac(nvdev, addr->sa_data); + err = rndis_filter_set_device_mac(nvdev, addr->__data); if (!err) { eth_commit_mac_addr_change(ndev, p); } else if (vf_netdev) { /* rollback change on VF */ - memcpy(addr->sa_data, ndev->dev_addr, ETH_ALEN); + memcpy(addr->__data, ndev->dev_addr, ETH_ALEN); dev_set_mac_address(vf_netdev, addr, NULL); } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 7045b1d58754..4df991e494bd 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -754,13 +754,13 @@ static int macvlan_sync_address(struct net_device *dev, static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); - struct sockaddr *addr = p; + struct sockaddr_storage *addr = p; - if (!is_valid_ether_addr(addr->sa_data)) + if (!is_valid_ether_addr(addr->__data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ - if (ether_addr_equal(dev->dev_addr, addr->sa_data)) + if (ether_addr_equal(dev->dev_addr, addr->__data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { @@ -768,10 +768,10 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) return dev_set_mac_address(vlan->lowerdev, addr, NULL); } - if (macvlan_addr_busy(vlan->port, addr->sa_data)) + if (macvlan_addr_busy(vlan->port, addr->__data)) return -EADDRINUSE; - return macvlan_sync_address(dev, addr->sa_data); + return macvlan_sync_address(dev, addr->__data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) @@ -1295,11 +1295,11 @@ static void macvlan_port_destroy(struct net_device *dev) */ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { - struct sockaddr sa; + struct sockaddr_storage ss; - sa.sa_family = port->dev->type; - memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); - dev_set_mac_address(port->dev, &sa, NULL); + ss.ss_family = port->dev->type; + memcpy(&ss.__data, port->perm_addr, port->dev->addr_len); + dev_set_mac_address(port->dev, &ss, NULL); } kfree(port); diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index b75ceb90359f..8bc56186b2a3 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -55,7 +55,7 @@ static int __set_port_dev_addr(struct net_device *port_dev, memcpy(addr.__data, dev_addr, port_dev->addr_len); addr.ss_family = port_dev->type; - return dev_set_mac_address(port_dev, (struct sockaddr *)&addr, NULL); + return dev_set_mac_address(port_dev, &addr, NULL); } static int team_port_set_orig_dev_addr(struct team_port *port) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b18dee1b1bb3..d6589b24c68d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -8432,7 +8432,7 @@ static int rtl8152_post_reset(struct usb_interface *intf) /* reset the MAC address in case of policy change */ if (determine_ethernet_addr(tp, &ss) >= 0) - dev_set_mac_address(tp->netdev, (struct sockaddr *)&ss, NULL); + dev_set_mac_address(tp->netdev, &ss, NULL); netdev = tp->netdev; if (!netif_running(netdev)) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 47200a394a02..b4242b997373 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4214,7 +4214,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index 6a0560546eeb..2b514d95c528 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9697,6 +9697,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, DECLARE_RWSEM(dev_addr_sem); +/* "sa" is a true struct sockaddr with limited "sa_data" member. */ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); diff --git a/net/core/dev_api.c b/net/core/dev_api.c index b5f293e637d9..6011a5ef649d 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -319,20 +319,20 @@ EXPORT_SYMBOL(dev_set_allmulti); /** * dev_set_mac_address() - change Media Access Control Address * @dev: device - * @sa: new address + * @ss: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device * * Return: 0 on success, -errno on failure. */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, (struct sockaddr_storage *)sa, extack); + ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); return ret; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index ee2b190e8e0d..4c07a475c567 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -234,7 +234,7 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); - rc = dev_set_mac_address(dev, (struct sockaddr *)&addr, NULL); + rc = dev_set_mac_address(dev, &addr, NULL); rtnl_unlock(); if (rc) goto dev_unregister; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 0202db2aea3e..b36947063783 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1058,7 +1058,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_config_apply_mac: rtnl_lock(); - ret = dev_set_mac_address(dev, (struct sockaddr *)&ndp->pending_mac, NULL); + ret = dev_set_mac_address(dev, &ndp->pending_mac, NULL); rtnl_unlock(); if (ret < 0) netdev_warn(dev, "NCSI: 'Writing MAC address to device failed\n"); -- cgit v1.2.3 From ae9fcd5a0f8ab7e12619e1c66312a03b842935c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 May 2025 13:46:16 -0700 Subject: net: core: Convert dev_set_mac_address_user() to use struct sockaddr_storage Convert callers of dev_set_mac_address_user() to use struct sockaddr_storage. Add sanity checks on dev->addr_len usage. Signed-off-by: Kees Cook Acked-by: Gustavo A. R. Silva Link: https://patch.msgid.link/20250521204619.2301870-8-kees@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/tap.c | 14 +++++++++----- drivers/net/tun.c | 8 +++++++- include/linux/netdevice.h | 2 +- net/core/dev_api.c | 5 +++-- net/core/dev_ioctl.c | 6 ++++-- 5 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index d4ece538f1b2..bdf0788d8e66 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -923,7 +923,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; - struct sockaddr sa; + struct sockaddr_storage ss; int s; int ret; @@ -1000,16 +1000,17 @@ static long tap_ioctl(struct file *file, unsigned int cmd, return -ENOLINK; } ret = 0; - dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name); + dev_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev), + tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || - copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa))) + copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: - if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa))) + if (copy_from_user(&ss, &ifr->ifr_hwaddr, sizeof(ifr->ifr_hwaddr))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); @@ -1017,7 +1018,10 @@ static long tap_ioctl(struct file *file, unsigned int cmd, rtnl_unlock(); return -ENOLINK; } - ret = dev_set_mac_address_user(tap->dev, &sa, NULL); + if (tap->dev->addr_len > sizeof(ifr->ifr_hwaddr)) + ret = -EINVAL; + else + ret = dev_set_mac_address_user(tap->dev, &ss, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7babd1e9a378..1207196cbbed 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3193,7 +3193,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, case SIOCSIFHWADDR: /* Set hw address */ - ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); + if (tun->dev->addr_len > sizeof(ifr.ifr_hwaddr)) { + ret = -EINVAL; + break; + } + ret = dev_set_mac_address_user(tun->dev, + (struct sockaddr_storage *)&ifr.ifr_hwaddr, + NULL); break; case TUNGETSNDBUF: diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4242b997373..adb14db25798 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4216,7 +4216,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); int dev_get_port_parent_id(struct net_device *dev, diff --git a/net/core/dev_api.c b/net/core/dev_api.c index 6011a5ef649d..1bf0153195f2 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -84,14 +84,15 @@ void dev_set_group(struct net_device *dev, int new_group) netdev_unlock_ops(dev); } -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, (struct sockaddr_storage *)sa, extack); + ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); up_write(&dev_addr_sem); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index fff13a8b48f1..616479e71466 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -572,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - if (dev->addr_len > sizeof(struct sockaddr)) + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; - return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) -- cgit v1.2.3 From ba3d7b93dbe3202bf8ead473d75885af773068bc Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Wed, 21 May 2025 23:27:06 +0200 Subject: wireguard: allowedips: add WGALLOWEDIP_F_REMOVE_ME flag The current netlink API for WireGuard does not directly support removal of allowed ips from a peer. A user can remove an allowed ip from a peer in one of two ways: 1. By using the WGPEER_F_REPLACE_ALLOWEDIPS flag and providing a new list of allowed ips which omits the allowed ip that is to be removed. 2. By reassigning an allowed ip to a "dummy" peer then removing that peer with WGPEER_F_REMOVE_ME. With the first approach, the driver completely rebuilds the allowed ip list for a peer. If my current configuration is such that a peer has allowed ips 192.168.0.2 and 192.168.0.3 and I want to remove 192.168.0.2 the actual transition looks like this. [192.168.0.2, 192.168.0.3] <-- Initial state [] <-- Step 1: Allowed ips removed for peer [192.168.0.3] <-- Step 2: Allowed ips added back for peer This is true even if the allowed ip list is small and the update does not need to be batched into multiple WG_CMD_SET_DEVICE requests, as the removal and subsequent addition of ips is non-atomic within a single request. Consequently, wg_allowedips_lookup_dst and wg_allowedips_lookup_src may return NULL while reconfiguring a peer even for packets bound for ips a user did not intend to remove leading to unintended interruptions in connectivity. This presents in userspace as failed calls to sendto and sendmsg for UDP sockets. In my case, I ran netperf while repeatedly reconfiguring the allowed ips for a peer with wg. /usr/local/bin/netperf -H 10.102.73.72 -l 10m -t UDP_STREAM -- -R 1 -m 1024 send_data: data send error: No route to host (errno 113) netperf: send_omni: send_data failed: No route to host While this may not be of particular concern for environments where peers and allowed ips are mostly static, systems like Cilium manage peers and allowed ips in a dynamic environment where peers (i.e. Kubernetes nodes) and allowed ips (i.e. pods running on those nodes) can frequently change making WGPEER_F_REPLACE_ALLOWEDIPS problematic. The second approach avoids any possible connectivity interruptions but is hacky and less direct, requiring the creation of a temporary peer just to dispose of an allowed ip. Introduce a new flag called WGALLOWEDIP_F_REMOVE_ME which in the same way that WGPEER_F_REMOVE_ME allows a user to remove a single peer from a WireGuard device's configuration allows a user to remove an ip from a peer's set of allowed ips. This enables incremental updates to a device's configuration without any connectivity blips or messy workarounds. A corresponding patch for wg extends the existing `wg set` interface to leverage this feature. $ wg set wg0 peer allowed-ips +192.168.88.0/24,-192.168.0.1/32 When '+' or '-' is prepended to any ip in the list, wg clears WGPEER_F_REPLACE_ALLOWEDIPS and sets the WGALLOWEDIP_F_REMOVE_ME flag on any ip prefixed with '-'. Signed-off-by: Jordan Rife [Jason: minor style nits, fixes to selftest, bump of wireguard-tools version] Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20250521212707.1767879-5-Jason@zx2c4.com Signed-off-by: Paolo Abeni --- drivers/net/wireguard/allowedips.c | 102 +++++++++++++++++------- drivers/net/wireguard/allowedips.h | 4 + drivers/net/wireguard/netlink.c | 37 ++++++--- drivers/net/wireguard/selftest/allowedips.c | 48 +++++++++++ include/uapi/linux/wireguard.h | 9 +++ tools/testing/selftests/wireguard/netns.sh | 29 +++++++ tools/testing/selftests/wireguard/qemu/Makefile | 2 +- 7 files changed, 187 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index 4b8528206cc8..09f7fcd7da78 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -249,6 +249,52 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key, return 0; } +static void remove_node(struct allowedips_node *node, struct mutex *lock) +{ + struct allowedips_node *child, **parent_bit, *parent; + bool free_parent; + + list_del_init(&node->peer_list); + RCU_INIT_POINTER(node->peer, NULL); + if (node->bit[0] && node->bit[1]) + return; + child = rcu_dereference_protected(node->bit[!rcu_access_pointer(node->bit[0])], + lockdep_is_held(lock)); + if (child) + child->parent_bit_packed = node->parent_bit_packed; + parent_bit = (struct allowedips_node **)(node->parent_bit_packed & ~3UL); + *parent_bit = child; + parent = (void *)parent_bit - + offsetof(struct allowedips_node, bit[node->parent_bit_packed & 1]); + free_parent = !rcu_access_pointer(node->bit[0]) && !rcu_access_pointer(node->bit[1]) && + (node->parent_bit_packed & 3) <= 1 && !rcu_access_pointer(parent->peer); + if (free_parent) + child = rcu_dereference_protected(parent->bit[!(node->parent_bit_packed & 1)], + lockdep_is_held(lock)); + call_rcu(&node->rcu, node_free_rcu); + if (!free_parent) + return; + if (child) + child->parent_bit_packed = parent->parent_bit_packed; + *(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child; + call_rcu(&parent->rcu, node_free_rcu); +} + +static int remove(struct allowedips_node __rcu **trie, u8 bits, const u8 *key, + u8 cidr, struct wg_peer *peer, struct mutex *lock) +{ + struct allowedips_node *node; + + if (unlikely(cidr > bits)) + return -EINVAL; + if (!rcu_access_pointer(*trie) || !node_placement(*trie, key, cidr, bits, &node, lock) || + peer != rcu_access_pointer(node->peer)) + return 0; + + remove_node(node, lock); + return 0; +} + void wg_allowedips_init(struct allowedips *table) { table->root4 = table->root6 = NULL; @@ -300,44 +346,38 @@ int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip, return add(&table->root6, 128, key, cidr, peer, lock); } +int wg_allowedips_remove_v4(struct allowedips *table, const struct in_addr *ip, + u8 cidr, struct wg_peer *peer, struct mutex *lock) +{ + /* Aligned so it can be passed to fls */ + u8 key[4] __aligned(__alignof(u32)); + + ++table->seq; + swap_endian(key, (const u8 *)ip, 32); + return remove(&table->root4, 32, key, cidr, peer, lock); +} + +int wg_allowedips_remove_v6(struct allowedips *table, const struct in6_addr *ip, + u8 cidr, struct wg_peer *peer, struct mutex *lock) +{ + /* Aligned so it can be passed to fls64 */ + u8 key[16] __aligned(__alignof(u64)); + + ++table->seq; + swap_endian(key, (const u8 *)ip, 128); + return remove(&table->root6, 128, key, cidr, peer, lock); +} + void wg_allowedips_remove_by_peer(struct allowedips *table, struct wg_peer *peer, struct mutex *lock) { - struct allowedips_node *node, *child, **parent_bit, *parent, *tmp; - bool free_parent; + struct allowedips_node *node, *tmp; if (list_empty(&peer->allowedips_list)) return; ++table->seq; - list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) { - list_del_init(&node->peer_list); - RCU_INIT_POINTER(node->peer, NULL); - if (node->bit[0] && node->bit[1]) - continue; - child = rcu_dereference_protected(node->bit[!rcu_access_pointer(node->bit[0])], - lockdep_is_held(lock)); - if (child) - child->parent_bit_packed = node->parent_bit_packed; - parent_bit = (struct allowedips_node **)(node->parent_bit_packed & ~3UL); - *parent_bit = child; - parent = (void *)parent_bit - - offsetof(struct allowedips_node, bit[node->parent_bit_packed & 1]); - free_parent = !rcu_access_pointer(node->bit[0]) && - !rcu_access_pointer(node->bit[1]) && - (node->parent_bit_packed & 3) <= 1 && - !rcu_access_pointer(parent->peer); - if (free_parent) - child = rcu_dereference_protected( - parent->bit[!(node->parent_bit_packed & 1)], - lockdep_is_held(lock)); - call_rcu(&node->rcu, node_free_rcu); - if (!free_parent) - continue; - if (child) - child->parent_bit_packed = parent->parent_bit_packed; - *(struct allowedips_node **)(parent->parent_bit_packed & ~3UL) = child; - call_rcu(&parent->rcu, node_free_rcu); - } + list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) + remove_node(node, lock); } int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr) diff --git a/drivers/net/wireguard/allowedips.h b/drivers/net/wireguard/allowedips.h index 2346c797eb4d..931958cb6e10 100644 --- a/drivers/net/wireguard/allowedips.h +++ b/drivers/net/wireguard/allowedips.h @@ -38,6 +38,10 @@ int wg_allowedips_insert_v4(struct allowedips *table, const struct in_addr *ip, u8 cidr, struct wg_peer *peer, struct mutex *lock); int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip, u8 cidr, struct wg_peer *peer, struct mutex *lock); +int wg_allowedips_remove_v4(struct allowedips *table, const struct in_addr *ip, + u8 cidr, struct wg_peer *peer, struct mutex *lock); +int wg_allowedips_remove_v6(struct allowedips *table, const struct in6_addr *ip, + u8 cidr, struct wg_peer *peer, struct mutex *lock); void wg_allowedips_remove_by_peer(struct allowedips *table, struct wg_peer *peer, struct mutex *lock); /* The ip input pointer should be __aligned(__alignof(u64))) */ diff --git a/drivers/net/wireguard/netlink.c b/drivers/net/wireguard/netlink.c index bbb1a7fe1c57..67f962eb8b46 100644 --- a/drivers/net/wireguard/netlink.c +++ b/drivers/net/wireguard/netlink.c @@ -46,7 +46,8 @@ static const struct nla_policy peer_policy[WGPEER_A_MAX + 1] = { static const struct nla_policy allowedip_policy[WGALLOWEDIP_A_MAX + 1] = { [WGALLOWEDIP_A_FAMILY] = { .type = NLA_U16 }, [WGALLOWEDIP_A_IPADDR] = NLA_POLICY_MIN_LEN(sizeof(struct in_addr)), - [WGALLOWEDIP_A_CIDR_MASK] = { .type = NLA_U8 } + [WGALLOWEDIP_A_CIDR_MASK] = { .type = NLA_U8 }, + [WGALLOWEDIP_A_FLAGS] = NLA_POLICY_MASK(NLA_U32, __WGALLOWEDIP_F_ALL), }; static struct wg_device *lookup_interface(struct nlattr **attrs, @@ -329,6 +330,7 @@ static int set_port(struct wg_device *wg, u16 port) static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) { int ret = -EINVAL; + u32 flags = 0; u16 family; u8 cidr; @@ -337,19 +339,30 @@ static int set_allowedip(struct wg_peer *peer, struct nlattr **attrs) return ret; family = nla_get_u16(attrs[WGALLOWEDIP_A_FAMILY]); cidr = nla_get_u8(attrs[WGALLOWEDIP_A_CIDR_MASK]); + if (attrs[WGALLOWEDIP_A_FLAGS]) + flags = nla_get_u32(attrs[WGALLOWEDIP_A_FLAGS]); if (family == AF_INET && cidr <= 32 && - nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) - ret = wg_allowedips_insert_v4( - &peer->device->peer_allowedips, - nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, - &peer->device->device_update_lock); - else if (family == AF_INET6 && cidr <= 128 && - nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) - ret = wg_allowedips_insert_v6( - &peer->device->peer_allowedips, - nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, peer, - &peer->device->device_update_lock); + nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in_addr)) { + if (flags & WGALLOWEDIP_F_REMOVE_ME) + ret = wg_allowedips_remove_v4(&peer->device->peer_allowedips, + nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, + peer, &peer->device->device_update_lock); + else + ret = wg_allowedips_insert_v4(&peer->device->peer_allowedips, + nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, + peer, &peer->device->device_update_lock); + } else if (family == AF_INET6 && cidr <= 128 && + nla_len(attrs[WGALLOWEDIP_A_IPADDR]) == sizeof(struct in6_addr)) { + if (flags & WGALLOWEDIP_F_REMOVE_ME) + ret = wg_allowedips_remove_v6(&peer->device->peer_allowedips, + nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, + peer, &peer->device->device_update_lock); + else + ret = wg_allowedips_insert_v6(&peer->device->peer_allowedips, + nla_data(attrs[WGALLOWEDIP_A_IPADDR]), cidr, + peer, &peer->device->device_update_lock); + } return ret; } diff --git a/drivers/net/wireguard/selftest/allowedips.c b/drivers/net/wireguard/selftest/allowedips.c index 25de7058701a..41837efa70cb 100644 --- a/drivers/net/wireguard/selftest/allowedips.c +++ b/drivers/net/wireguard/selftest/allowedips.c @@ -460,6 +460,10 @@ static __init struct wg_peer *init_peer(void) wg_allowedips_insert_v##version(&t, ip##version(ipa, ipb, ipc, ipd), \ cidr, mem, &mutex) +#define remove(version, mem, ipa, ipb, ipc, ipd, cidr) \ + wg_allowedips_remove_v##version(&t, ip##version(ipa, ipb, ipc, ipd), \ + cidr, mem, &mutex) + #define maybe_fail() do { \ ++i; \ if (!_s) { \ @@ -585,6 +589,50 @@ bool __init wg_allowedips_selftest(void) test_negative(4, a, 192, 0, 0, 0); test_negative(4, a, 255, 0, 0, 0); + insert(4, a, 1, 0, 0, 0, 32); + insert(4, a, 192, 0, 0, 0, 24); + insert(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128); + insert(6, a, 0x24446800, 0xf0e40800, 0xeeaebeef, 0, 98); + test(4, a, 1, 0, 0, 0); + test(4, a, 192, 0, 0, 1); + test(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + test(6, a, 0x24446800, 0xf0e40800, 0xeeaebeef, 0x10101010); + /* Must be an exact match to remove */ + remove(4, a, 192, 0, 0, 0, 32); + test(4, a, 192, 0, 0, 1); + /* NULL peer should have no effect and return 0 */ + test_boolean(!remove(4, NULL, 192, 0, 0, 0, 24)); + test(4, a, 192, 0, 0, 1); + /* different peer should have no effect and return 0 */ + test_boolean(!remove(4, b, 192, 0, 0, 0, 24)); + test(4, a, 192, 0, 0, 1); + /* invalid CIDR should have no effect and return -EINVAL */ + test_boolean(remove(4, b, 192, 0, 0, 0, 33) == -EINVAL); + test(4, a, 192, 0, 0, 1); + remove(4, a, 192, 0, 0, 0, 24); + test_negative(4, a, 192, 0, 0, 1); + remove(4, a, 1, 0, 0, 0, 32); + test_negative(4, a, 1, 0, 0, 0); + /* Must be an exact match to remove */ + remove(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 96); + test(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + /* NULL peer should have no effect and return 0 */ + test_boolean(!remove(6, NULL, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128)); + test(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + /* different peer should have no effect and return 0 */ + test_boolean(!remove(6, b, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128)); + test(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + /* invalid CIDR should have no effect and return -EINVAL */ + test_boolean(remove(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 129) == -EINVAL); + test(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + remove(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef, 128); + test_negative(6, a, 0x24446801, 0x40e40800, 0xdeaebeef, 0xdefbeef); + /* Must match the peer to remove */ + remove(6, b, 0x24446800, 0xf0e40800, 0xeeaebeef, 0, 98); + test(6, a, 0x24446800, 0xf0e40800, 0xeeaebeef, 0x10101010); + remove(6, a, 0x24446800, 0xf0e40800, 0xeeaebeef, 0, 98); + test_negative(6, a, 0x24446800, 0xf0e40800, 0xeeaebeef, 0x10101010); + wg_allowedips_free(&t, &mutex); wg_allowedips_init(&t); insert(4, a, 192, 168, 0, 0, 16); diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index ae88be14c947..8c26391196d5 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -101,6 +101,10 @@ * WGALLOWEDIP_A_FAMILY: NLA_U16 * WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 + * WGALLOWEDIP_A_FLAGS: NLA_U32, WGALLOWEDIP_F_REMOVE_ME if + * the specified IP should be removed; + * otherwise, this IP will be added if + * it is not already present. * 0: NLA_NESTED * ... * 0: NLA_NESTED @@ -184,11 +188,16 @@ enum wgpeer_attribute { }; #define WGPEER_A_MAX (__WGPEER_A_LAST - 1) +enum wgallowedip_flag { + WGALLOWEDIP_F_REMOVE_ME = 1U << 0, + __WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME +}; enum wgallowedip_attribute { WGALLOWEDIP_A_UNSPEC, WGALLOWEDIP_A_FAMILY, WGALLOWEDIP_A_IPADDR, WGALLOWEDIP_A_CIDR_MASK, + WGALLOWEDIP_A_FLAGS, __WGALLOWEDIP_A_LAST }; #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 55500f901fbc..a8f550aecb35 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -611,6 +611,35 @@ n0 wg set wg0 peer "$pub2" allowed-ips "$allowedips" } < <(n0 wg show wg0 allowed-ips) ip0 link del wg0 +allowedips=( ) +for i in {1..197}; do + allowedips+=( 192.168.0.$i ) + allowedips+=( abcd::$i ) +done +saved_ifs="$IFS" +IFS=, +allowedips="${allowedips[*]}" +IFS="$saved_ifs" +ip0 link add wg0 type wireguard +n0 wg set wg0 peer "$pub1" allowed-ips "$allowedips" +n0 wg set wg0 peer "$pub1" allowed-ips -192.168.0.1/32,-192.168.0.20/32,-192.168.0.100/32,-abcd::1/128,-abcd::20/128,-abcd::100/128 +{ + read -r pub allowedips + [[ $pub == "$pub1" ]] + i=0 + for ip in $allowedips; do + [[ $ip != "192.168.0.1" ]] + [[ $ip != "192.168.0.20" ]] + [[ $ip != "192.168.0.100" ]] + [[ $ip != "abcd::1" ]] + [[ $ip != "abcd::20" ]] + [[ $ip != "abcd::100" ]] + ((++i)) + done + ((i == 388)) +} < <(n0 wg show wg0 allowed-ips) +ip0 link del wg0 + ! n0 wg show doesnotexist || false ip0 link add wg0 type wireguard diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index 35856b11c143..f6fbd88914ee 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -43,7 +43,7 @@ $(eval $(call tar_download,IPROUTE2,iproute2,5.17.0,.tar.gz,https://www.kernel.o $(eval $(call tar_download,IPTABLES,iptables,1.8.7,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,c109c96bb04998cd44156622d36f8e04b140701ec60531a10668cfdff5e8d8f0)) $(eval $(call tar_download,NMAP,nmap,7.92,.tgz,https://nmap.org/dist/,064183ea642dc4c12b1ab3b5358ce1cef7d2e7e11ffa2849f16d339f5b717117)) $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) -$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20210914,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,97ff31489217bb265b7ae850d3d0f335ab07d2652ba1feec88b734bc96bd05ac)) +$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20250521,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,b6f2628b85b1b23cc06517ec9c74f82d52c4cdbd020f3dd2f00c972a1782950e)) export CFLAGS := -O3 -pipe ifeq ($(HOST_ARCH),$(ARCH)) -- cgit v1.2.3 From 5ec40864aaecc4bd66fe67541d4a41091ed664a5 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 22 May 2025 01:18:22 +0200 Subject: vsock: Move lingering logic to af_vsock core Lingering should be transport-independent in the long run. In preparation for supporting other transports, as well as the linger on shutdown(), move code to core. Generalize by querying vsock_transport::unsent_bytes(), guard against the callback being unimplemented. Do not pass sk_lingertime explicitly. Pull SOCK_LINGER check into vsock_linger(). Flatten the function. Remove the nested block by inverting the condition: return early on !timeout. Suggested-by: Stefano Garzarella Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250522-vsock-linger-v6-2-2ad00b0e447e@rbox.co Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 1 + net/vmw_vsock/af_vsock.c | 33 +++++++++++++++++++++++++++++++++ net/vmw_vsock/virtio_transport_common.c | 23 ++--------------------- 3 files changed, 36 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9e85424c8343..d56e6e135158 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fc6afbc8d680..2e7a3034e965 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1013,6 +1013,39 @@ out: return err; } +void vsock_linger(struct sock *sk) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + ssize_t (*unsent)(struct vsock_sock *vsk); + struct vsock_sock *vsk = vsock_sk(sk); + long timeout; + + if (!sock_flag(sk, SOCK_LINGER)) + return; + + timeout = sk->sk_lingertime; + if (!timeout) + return; + + /* Transports must implement `unsent_bytes` if they want to support + * SOCK_LINGER through `vsock_linger()` since we use it to check when + * the socket can be closed. + */ + unsent = vsk->transport->unsent_bytes; + if (!unsent) + return; + + add_wait_queue(sk_sleep(sk), &wait); + + do { + if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) + break; + } while (!signal_pending(current) && timeout); + + remove_wait_queue(sk_sleep(sk), &wait); +} +EXPORT_SYMBOL_GPL(vsock_linger); + static int vsock_shutdown(struct socket *sock, int mode) { int err; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f2f1b166731b..7897fd970dd8 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1191,25 +1191,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk) vsock_remove_sock(vsk); } -static void virtio_transport_wait_close(struct sock *sk, long timeout) -{ - if (timeout) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct vsock_sock *vsk = vsock_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - - do { - if (sk_wait_event(sk, &timeout, - virtio_transport_unsent_bytes(vsk) == 0, - &wait)) - break; - } while (!signal_pending(current) && timeout); - - remove_wait_queue(sk_sleep(sk), &wait); - } -} - static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, bool cancel_timeout) { @@ -1279,8 +1260,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); - if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) - virtio_transport_wait_close(sk, sk->sk_lingertime); + if (!(current->flags & PF_EXITING)) + vsock_linger(sk); if (sock_flag(sk, SOCK_DONE)) { return true; -- cgit v1.2.3 From e9cb929670a1e98b592b30f03f06e9e20110f318 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 May 2025 13:21:47 +0200 Subject: net: phy: fix up const issues in to_mdio_device() and to_phy_device() Both to_mdio_device() and to_phy_device() "throw away" the const pointer attribute passed to them and return a non-const pointer, which generally is not a good thing overall. Fix this up by using container_of_const() which was designed for this very problem. Cc: Alexander Lobakin Cc: Andrew Lunn Cc: Heiner Kallweit Cc: Russell King Fixes: 7eab14de73a8 ("mdio, phy: fix -Wshadow warnings triggered by nested container_of()") Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025052246-conduit-glory-8fc9@gregkh Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 5 +---- include/linux/phy.h | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3c3deac57894..e43ff9f980a4 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -45,10 +45,7 @@ struct mdio_device { unsigned int reset_deassert_delay; }; -static inline struct mdio_device *to_mdio_device(const struct device *dev) -{ - return container_of(dev, struct mdio_device, dev); -} +#define to_mdio_device(__dev) container_of_const(__dev, struct mdio_device, dev) /* struct mdio_driver_common: Common to all MDIO drivers */ struct mdio_driver_common { diff --git a/include/linux/phy.h b/include/linux/phy.h index 32b9da274115..e194dad1623d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -744,10 +744,7 @@ struct phy_device { #define PHY_F_NO_IRQ 0x80000000 #define PHY_F_RXC_ALWAYS_ON 0x40000000 -static inline struct phy_device *to_phy_device(const struct device *dev) -{ - return container_of(to_mdio_device(dev), struct phy_device, mdio); -} +#define to_phy_device(__dev) container_of_const(to_mdio_device(__dev), struct phy_device, mdio) /** * struct phy_tdr_config - Configuration of a TDR raw test -- cgit v1.2.3 From 33f1b3677a13dda60a2a59858f7916672e7f1546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 May 2025 07:47:45 +0200 Subject: sctp: mark sctp_do_peeloff static sctp_do_peeloff is only used inside of net/sctp/socket.c, so mark it static. Signed-off-by: Christoph Hellwig Acked-by: Xin Long Link: https://patch.msgid.link/20250526054745.2329201-1-hch@lst.de Signed-off-by: Jakub Kicinski --- include/net/sctp/sctp.h | 2 -- net/sctp/socket.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d8da764cf6de..e96d1bd087f6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -364,8 +364,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 90b75d4ec329..1e5739858c20 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5627,7 +5627,8 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv } /* Helper routine to branch off an association to a new socket. */ -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) +static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, + struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); struct sctp_sock *sp = sctp_sk(sk); @@ -5675,7 +5676,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) return err; } -EXPORT_SYMBOL(sctp_do_peeloff); static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff, struct file **newfile, unsigned flags) -- cgit v1.2.3 From 290e5d3c49f687c1567bde634dc33d57b0674919 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Mon, 19 May 2025 09:20:36 -0700 Subject: net: mana: Add support for Multi Vports on Bare metal To support Multi Vports on Bare metal, increase the device config response version. And, skip the register HW vport, and register filter steps, when the Bare metal hostmode is set. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1747671636-5810-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 24 ++++++++++++++++-------- include/net/mana/mana.h | 4 +++- 2 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 2bac6be8f6a0..9c58d9e0bbb5 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -921,7 +921,7 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, u32 proto_minor_ver, u32 proto_micro_ver, - u16 *max_num_vports) + u16 *max_num_vports, u8 *bm_hostmode) { struct gdma_context *gc = ac->gdma_dev->gdma_context; struct mana_query_device_cfg_resp resp = {}; @@ -932,7 +932,7 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG, sizeof(req), sizeof(resp)); - req.hdr.resp.msg_version = GDMA_MESSAGE_V2; + req.hdr.resp.msg_version = GDMA_MESSAGE_V3; req.proto_major_ver = proto_major_ver; req.proto_minor_ver = proto_minor_ver; @@ -956,11 +956,16 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, *max_num_vports = resp.max_num_vports; - if (resp.hdr.response.msg_version == GDMA_MESSAGE_V2) + if (resp.hdr.response.msg_version >= GDMA_MESSAGE_V2) gc->adapter_mtu = resp.adapter_mtu; else gc->adapter_mtu = ETH_FRAME_LEN; + if (resp.hdr.response.msg_version >= GDMA_MESSAGE_V3) + *bm_hostmode = resp.bm_hostmode; + else + *bm_hostmode = 0; + debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); return 0; @@ -2441,7 +2446,7 @@ static void mana_destroy_vport(struct mana_port_context *apc) mana_destroy_txq(apc); mana_uncfg_vport(apc); - if (gd->gdma_context->is_pf) + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) mana_pf_deregister_hw_vport(apc); } @@ -2453,7 +2458,7 @@ static int mana_create_vport(struct mana_port_context *apc, apc->default_rxobj = INVALID_MANA_HANDLE; - if (gd->gdma_context->is_pf) { + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_hw_vport(apc); if (err) return err; @@ -2689,7 +2694,7 @@ int mana_alloc_queues(struct net_device *ndev) goto destroy_vport; } - if (gd->gdma_context->is_pf) { + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) { err = mana_pf_register_filter(apc); if (err) goto destroy_vport; @@ -2751,7 +2756,7 @@ static int mana_dealloc_queues(struct net_device *ndev) mana_chn_setxdp(apc, NULL); - if (gd->gdma_context->is_pf) + if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) mana_pf_deregister_filter(apc); /* No packet can be transmitted now since apc->port_is_up is false. @@ -2998,6 +3003,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming) struct gdma_context *gc = gd->gdma_context; struct mana_context *ac = gd->driver_data; struct device *dev = gc->dev; + u8 bm_hostmode = 0; u16 num_ports = 0; int err; int i; @@ -3026,10 +3032,12 @@ int mana_probe(struct gdma_dev *gd, bool resuming) } err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, - MANA_MICRO_VERSION, &num_ports); + MANA_MICRO_VERSION, &num_ports, &bm_hostmode); if (err) goto out; + ac->bm_hostmode = bm_hostmode; + if (!resuming) { ac->num_ports = num_ports; } else { diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0f78065de8fe..38238c1d00bf 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -408,6 +408,7 @@ struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; @@ -557,7 +558,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ -- cgit v1.2.3