From 4721031c3559db8eae61df305f10c00099a7c1d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:51 -0800 Subject: net: move gro definitions to include/net/gro.h include/linux/netdevice.h became too big, move gro stuff into include/net/gro.h Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 348 ---------------------------------------------- 1 file changed, 348 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..d95c9839ce90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,109 +2520,6 @@ static inline void netif_napi_del(struct napi_struct *napi) synchronize_net(); } -struct napi_gro_cb { - /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ - void *frag0; - - /* Length of frag0. */ - unsigned int frag0_len; - - /* This indicates where we are processing relative to skb->data. */ - int data_offset; - - /* This is non-zero if the packet cannot be merged with the new skb. */ - u16 flush; - - /* Save the IP ID here and check when we get to the transport layer */ - u16 flush_id; - - /* Number of segments aggregated. */ - u16 count; - - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; - - /* jiffies when first packet was created/queued */ - unsigned long age; - - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; - - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; - - /* Used in tunnel GRO receive */ - u8 encap_mark:1; - - /* GRO checksum is valid */ - u8 csum_valid:1; - - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; - - /* Free the skb? */ - u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 - - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; - - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; - - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; - - /* GRO is done by frag_list pointer chaining. */ - u8 is_flist:1; - - /* used to support CHECKSUM_COMPLETE for tunneling protocols */ - __wsum csum; - - /* used in skb_gro_receive() slow path */ - struct sk_buff *last; -}; - -#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) - -#define GRO_RECURSION_LIMIT 15 -static inline int gro_recursion_inc_test(struct sk_buff *skb) -{ - return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT; -} - -typedef struct sk_buff *(*gro_receive_t)(struct list_head *, struct sk_buff *); -static inline struct sk_buff *call_gro_receive(gro_receive_t cb, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(head, skb); -} - -typedef struct sk_buff *(*gro_receive_sk_t)(struct sock *, struct list_head *, - struct sk_buff *); -static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, - struct sock *sk, - struct list_head *head, - struct sk_buff *skb) -{ - if (unlikely(gro_recursion_inc_test(skb))) { - NAPI_GRO_CB(skb)->flush |= 1; - return NULL; - } - - return cb(sk, head, skb); -} - struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; @@ -3008,251 +2905,6 @@ int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); -static inline unsigned int skb_gro_offset(const struct sk_buff *skb) -{ - return NAPI_GRO_CB(skb)->data_offset; -} - -static inline unsigned int skb_gro_len(const struct sk_buff *skb) -{ - return skb->len - NAPI_GRO_CB(skb)->data_offset; -} - -static inline void skb_gro_pull(struct sk_buff *skb, unsigned int len) -{ - NAPI_GRO_CB(skb)->data_offset += len; -} - -static inline void *skb_gro_header_fast(struct sk_buff *skb, - unsigned int offset) -{ - return NAPI_GRO_CB(skb)->frag0 + offset; -} - -static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) -{ - return NAPI_GRO_CB(skb)->frag0_len < hlen; -} - -static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) -{ - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; -} - -static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, - unsigned int offset) -{ - if (!pskb_may_pull(skb, hlen)) - return NULL; - - skb_gro_frag0_invalidate(skb); - return skb->data + offset; -} - -static inline void *skb_gro_network_header(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + - skb_network_offset(skb); -} - -static inline void skb_gro_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (NAPI_GRO_CB(skb)->csum_valid) - NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum, - csum_partial(start, len, 0)); -} - -/* GRO checksum functions. These are logical equivalents of the normal - * checksum functions (in skbuff.h) except that they operate on the GRO - * offsets and fields in sk_buff. - */ - -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb); - -static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); -} - -static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, - bool zero_okay, - __sum16 check) -{ - return ((skb->ip_summed != CHECKSUM_PARTIAL || - skb_checksum_start_offset(skb) < - skb_gro_offset(skb)) && - !skb_at_gro_remcsum_start(skb) && - NAPI_GRO_CB(skb)->csum_cnt == 0 && - (!zero_okay || check)); -} - -static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb, - __wsum psum) -{ - if (NAPI_GRO_CB(skb)->csum_valid && - !csum_fold(csum_add(psum, NAPI_GRO_CB(skb)->csum))) - return 0; - - NAPI_GRO_CB(skb)->csum = psum; - - return __skb_gro_checksum_complete(skb); -} - -static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb) -{ - if (NAPI_GRO_CB(skb)->csum_cnt > 0) { - /* Consume a checksum from CHECKSUM_UNNECESSARY */ - NAPI_GRO_CB(skb)->csum_cnt--; - } else { - /* Update skb for CHECKSUM_UNNECESSARY and csum_level when we - * verified a new top level checksum or an encapsulated one - * during GRO. This saves work if we fallback to normal path. - */ - __skb_incr_checksum_unnecessary(skb); - } -} - -#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \ - compute_pseudo) \ -({ \ - __sum16 __ret = 0; \ - if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \ - __ret = __skb_gro_checksum_validate_complete(skb, \ - compute_pseudo(skb, proto)); \ - if (!__ret) \ - skb_gro_incr_csum_unnecessary(skb); \ - __ret; \ -}) - -#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo) - -#define skb_gro_checksum_validate_zero_check(skb, proto, check, \ - compute_pseudo) \ - __skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo) - -#define skb_gro_checksum_simple_validate(skb) \ - __skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo) - -static inline bool __skb_gro_checksum_convert_check(struct sk_buff *skb) -{ - return (NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid); -} - -static inline void __skb_gro_checksum_convert(struct sk_buff *skb, - __wsum pseudo) -{ - NAPI_GRO_CB(skb)->csum = ~pseudo; - NAPI_GRO_CB(skb)->csum_valid = 1; -} - -#define skb_gro_checksum_try_convert(skb, proto, compute_pseudo) \ -do { \ - if (__skb_gro_checksum_convert_check(skb)) \ - __skb_gro_checksum_convert(skb, \ - compute_pseudo(skb, proto)); \ -} while (0) - -struct gro_remcsum { - int offset; - __wsum delta; -}; - -static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) -{ - grc->offset = 0; - grc->delta = 0; -} - -static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - unsigned int off, size_t hdrlen, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) -{ - __wsum delta; - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - - BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); - - if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; - return ptr; - } - - ptr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } - - delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, - start, offset); - - /* Adjust skb->csum since we changed the packet */ - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - - grc->offset = off + hdrlen + offset; - grc->delta = delta; - - return ptr; -} - -static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, - struct gro_remcsum *grc) -{ - void *ptr; - size_t plen = grc->offset + sizeof(u16); - - if (!grc->delta) - return; - - ptr = skb_gro_header_fast(skb, grc->offset); - if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { - ptr = skb_gro_header_slow(skb, plen, grc->offset); - if (!ptr) - return; - } - - remcsum_unadjust((__sum16 *)ptr, grc->delta); -} - -#ifdef CONFIG_XFRM_OFFLOAD -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - if (PTR_ERR(pp) != -EINPROGRESS) - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - if (PTR_ERR(pp) != -EINPROGRESS) { - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; - } -} -#else -static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff *pp, int flush) -{ - NAPI_GRO_CB(skb)->flush |= flush; -} -static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, - struct sk_buff *pp, - int flush, - struct gro_remcsum *grc) -{ - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, grc); - skb->remcsum_offload = 0; -} -#endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, -- cgit v1.2.3 From 0b935d7f8c07bf0a192712bdbf76dbf45ef8b115 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:52 -0800 Subject: net: gro: move skb_gro_receive_list to udp_offload.c This helper is used once, no need to keep it in fat net/core/skbuff.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - net/core/skbuff.c | 26 -------------------------- net/ipv4/udp_offload.c | 27 +++++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d95c9839ce90..ce6ee1453dbc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2903,7 +2903,6 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5b9472bdceff..8560d50c960b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3920,32 +3920,6 @@ err_linearize: } EXPORT_SYMBOL_GPL(skb_segment_list); -int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) -{ - if (unlikely(p->len + skb->len >= 65536)) - return -E2BIG; - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - - skb_pull(skb, skb_gro_offset(skb)); - - NAPI_GRO_CB(p)->last = skb; - NAPI_GRO_CB(p)->count++; - p->data_len += skb->len; - - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - p->truesize += skb->truesize; - p->len += skb->len; - - NAPI_GRO_CB(skb)->same_flow = 1; - - return 0; -} - /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 7fbf9975e8c0..cbeb8965d1b7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -425,6 +425,33 @@ out: return segs; } +static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) +{ + if (unlikely(p->len + skb->len >= 65536)) + return -E2BIG; + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + + skb_pull(skb, skb_gro_offset(skb)); + + NAPI_GRO_CB(p)->last = skb; + NAPI_GRO_CB(p)->count++; + p->data_len += skb->len; + + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + p->truesize += skb->truesize; + p->len += skb->len; + + NAPI_GRO_CB(skb)->same_flow = 1; + + return 0; +} + + #define UDP_GRO_CNT_MAX 64 static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) -- cgit v1.2.3 From e456a18a390b96f22b0de2acd4d0f49c72ed2280 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:53 -0800 Subject: net: gro: move skb_gro_receive into net/core/gro.c net/core/gro.c will contain all core gro functions, to shrink net/core/skbuff.c and net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - include/net/gro.h | 2 + net/core/Makefile | 2 +- net/core/gro.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/skbuff.c | 117 --------------------------------------------- 5 files changed, 121 insertions(+), 119 deletions(-) create mode 100644 net/core/gro.c (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce6ee1453dbc..93d397db9ec4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2902,7 +2902,6 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/include/net/gro.h b/include/net/gro.h index 1ffbe74b2e35..f988bf3440f8 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -414,4 +414,6 @@ static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) skb_gro_len(skb), proto, 0)); } +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); + #endif /* _NET_IPV6_GRO_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 4268846f2f47..6bdcb2cafed8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o xdp.o flow_offload.o + fib_notifier.o xdp.o flow_offload.o gro.o obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o diff --git a/net/core/gro.c b/net/core/gro.c new file mode 100644 index 000000000000..91a74c4da9ff --- /dev/null +++ b/net/core/gro.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include + +int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) +{ + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + unsigned int len = skb_gro_len(skb); + unsigned int delta_truesize; + unsigned int new_truesize; + struct sk_buff *lp; + + if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) + return -E2BIG; + + lp = NAPI_GRO_CB(p)->last; + pinfo = skb_shinfo(lp); + + if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + if (nr_frags > MAX_SKB_FRAGS) + goto merge; + + offset -= headlen; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + skb_frag_off_add(frag, offset); + skb_frag_size_sub(frag, offset); + + /* all fragments truesize : remove (head size + sk_buff) */ + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; + + skb->truesize = new_truesize; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; + goto done; + } else if (skb->head_frag) { + int nr_frags = pinfo->nr_frags; + skb_frag_t *frag = pinfo->frags + nr_frags; + struct page *page = virt_to_head_page(skb->head); + unsigned int first_size = headlen - offset; + unsigned int first_offset; + + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) + goto merge; + + first_offset = skb->data - + (unsigned char *)page_address(page) + + offset; + + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; + + __skb_frag_set_page(frag, page); + skb_frag_off_set(frag, first_offset); + skb_frag_size_set(frag, first_size); + + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); + /* We dont need to clear skbinfo->nr_frags here */ + + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; + goto done; + } + +merge: + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; + delta_truesize = skb->truesize; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skb_frag_off_add(&skbinfo->frags[0], eat); + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + if (NAPI_GRO_CB(p)->last == p) + skb_shinfo(p)->frag_list = skb; + else + NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; + __skb_header_release(skb); + lp = p; + +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += delta_truesize; + p->len += len; + if (lp != p) { + lp->data_len += len; + lp->truesize += delta_truesize; + lp->len += len; + } + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8560d50c960b..73720bc1daa3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,7 +64,6 @@ #include #include -#include #include #include #include @@ -4272,122 +4271,6 @@ err: } EXPORT_SYMBOL_GPL(skb_segment); -int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) -{ - struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); - unsigned int offset = skb_gro_offset(skb); - unsigned int headlen = skb_headlen(skb); - unsigned int len = skb_gro_len(skb); - unsigned int delta_truesize; - unsigned int new_truesize; - struct sk_buff *lp; - - if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) - return -E2BIG; - - lp = NAPI_GRO_CB(p)->last; - pinfo = skb_shinfo(lp); - - if (headlen <= offset) { - skb_frag_t *frag; - skb_frag_t *frag2; - int i = skbinfo->nr_frags; - int nr_frags = pinfo->nr_frags + i; - - if (nr_frags > MAX_SKB_FRAGS) - goto merge; - - offset -= headlen; - pinfo->nr_frags = nr_frags; - skbinfo->nr_frags = 0; - - frag = pinfo->frags + nr_frags; - frag2 = skbinfo->frags + i; - do { - *--frag = *--frag2; - } while (--i); - - skb_frag_off_add(frag, offset); - skb_frag_size_sub(frag, offset); - - /* all fragments truesize : remove (head size + sk_buff) */ - new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); - delta_truesize = skb->truesize - new_truesize; - - skb->truesize = new_truesize; - skb->len -= skb->data_len; - skb->data_len = 0; - - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; - goto done; - } else if (skb->head_frag) { - int nr_frags = pinfo->nr_frags; - skb_frag_t *frag = pinfo->frags + nr_frags; - struct page *page = virt_to_head_page(skb->head); - unsigned int first_size = headlen - offset; - unsigned int first_offset; - - if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) - goto merge; - - first_offset = skb->data - - (unsigned char *)page_address(page) + - offset; - - pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; - - __skb_frag_set_page(frag, page); - skb_frag_off_set(frag, first_offset); - skb_frag_size_set(frag, first_size); - - memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); - /* We dont need to clear skbinfo->nr_frags here */ - - new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); - delta_truesize = skb->truesize - new_truesize; - skb->truesize = new_truesize; - NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; - goto done; - } - -merge: - /* sk owenrship - if any - completely transferred to the aggregated packet */ - skb->destructor = NULL; - delta_truesize = skb->truesize; - if (offset > headlen) { - unsigned int eat = offset - headlen; - - skb_frag_off_add(&skbinfo->frags[0], eat); - skb_frag_size_sub(&skbinfo->frags[0], eat); - skb->data_len -= eat; - skb->len -= eat; - offset = headlen; - } - - __skb_pull(skb, offset); - - if (NAPI_GRO_CB(p)->last == p) - skb_shinfo(p)->frag_list = skb; - else - NAPI_GRO_CB(p)->last->next = skb; - NAPI_GRO_CB(p)->last = skb; - __skb_header_release(skb); - lp = p; - -done: - NAPI_GRO_CB(p)->count++; - p->data_len += len; - p->truesize += delta_truesize; - p->len += len; - if (lp != p) { - lp->data_len += len; - lp->truesize += delta_truesize; - lp->len += len; - } - NAPI_GRO_CB(skb)->same_flow = 1; - return 0; -} - #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) -- cgit v1.2.3 From 587652bbdd06ab38a4c1b85e40f933d2cf4a1147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:05:54 -0800 Subject: net: gro: populate net/core/gro.c Move gro code and data from net/core/dev.c to net/core/gro.c to ease maintenance. gro_normal_list() and gro_normal_one() are inlined because they are called from both files. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/gro.h | 22 ++ net/core/dev.c | 668 +--------------------------------------------- net/core/gro.c | 648 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 672 insertions(+), 667 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93d397db9ec4..31a7e6b27681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3657,6 +3657,7 @@ int netif_rx_ni(struct sk_buff *skb); int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); +void netif_receive_skb_list_internal(struct list_head *head); void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); diff --git a/include/net/gro.h b/include/net/gro.h index f988bf3440f8..d0e7df691a80 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -416,4 +416,26 @@ static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto) int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static inline void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) +{ + list_add_tail(&skb->list, &napi->rx_list); + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + + #endif /* _NET_IPV6_GRO_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 15ac064b5562..92c9258cbf28 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,16 +153,10 @@ #include "net-sysfs.h" -#define MAX_GRO_SKBS 8 - -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(ptype_lock); -static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ -static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, @@ -604,84 +598,6 @@ void dev_remove_pack(struct packet_type *pt) EXPORT_SYMBOL(dev_remove_pack); -/** - * dev_add_offload - register offload handlers - * @po: protocol offload declaration - * - * Add protocol offload handlers to the networking stack. The passed - * &proto_offload is linked into kernel lists and may not be freed until - * it has been removed from the kernel lists. - * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new offload handlers (until the next received packet). - */ -void dev_add_offload(struct packet_offload *po) -{ - struct packet_offload *elem; - - spin_lock(&offload_lock); - list_for_each_entry(elem, &offload_base, list) { - if (po->priority < elem->priority) - break; - } - list_add_rcu(&po->list, elem->list.prev); - spin_unlock(&offload_lock); -} -EXPORT_SYMBOL(dev_add_offload); - -/** - * __dev_remove_offload - remove offload handler - * @po: packet offload declaration - * - * Remove a protocol offload handler that was previously added to the - * kernel offload handlers by dev_add_offload(). The passed &offload_type - * is removed from the kernel lists and can be freed or reused once this - * function returns. - * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. - */ -static void __dev_remove_offload(struct packet_offload *po) -{ - struct list_head *head = &offload_base; - struct packet_offload *po1; - - spin_lock(&offload_lock); - - list_for_each_entry(po1, head, list) { - if (po == po1) { - list_del_rcu(&po->list); - goto out; - } - } - - pr_warn("dev_remove_offload: %p not found\n", po); -out: - spin_unlock(&offload_lock); -} - -/** - * dev_remove_offload - remove packet offload handler - * @po: packet offload declaration - * - * Remove a packet offload handler that was previously added to the kernel - * offload handlers by dev_add_offload(). The passed &offload_type is - * removed from the kernel lists and can be freed or reused once this - * function returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_offload(struct packet_offload *po) -{ - __dev_remove_offload(po); - - synchronize_net(); -} -EXPORT_SYMBOL(dev_remove_offload); - /******************************************************************************* * * Device Interface Subroutines @@ -3315,40 +3231,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) return __vlan_get_protocol(skb, type, depth); } -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; - int vlan_depth = skb->mac_len; - __be16 type = skb_network_protocol(skb, &vlan_depth); - - if (unlikely(!type)) - return ERR_PTR(-EINVAL); - - __skb_pull(skb, vlan_depth); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &offload_base, list) { - if (ptype->type == type && ptype->callbacks.gso_segment) { - segs = ptype->callbacks.gso_segment(skb, features); - break; - } - } - rcu_read_unlock(); - - __skb_push(skb, skb->data - skb_mac_header(skb)); - - return segs; -} -EXPORT_SYMBOL(skb_mac_gso_segment); - - /* openvswitch calls this on rx path, so we need a different check. */ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) @@ -4320,8 +4202,6 @@ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; -/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ -int gro_normal_batch __read_mostly = 8; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -5664,7 +5544,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) return ret; } -static void netif_receive_skb_list_internal(struct list_head *head) +void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; struct list_head sublist; @@ -5842,550 +5722,6 @@ static void flush_all_backlogs(void) cpus_read_unlock(); } -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) -{ - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) -{ - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct list_head *head = &offload_base; - int err = -ENOENT; - - BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); - - if (NAPI_GRO_CB(skb)->count == 1) { - skb_shinfo(skb)->gso_size = 0; - goto out; - } - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - - err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, - ipv6_gro_complete, inet_gro_complete, - skb, 0); - break; - } - rcu_read_unlock(); - - if (err) { - WARN_ON(&ptype->list == head); - kfree_skb(skb); - return; - } - -out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); -} - -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) -{ - struct list_head *head = &napi->gro_hash[index].list; - struct sk_buff *skb, *p; - - list_for_each_entry_safe_reverse(skb, p, head, list) { - if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) - return; - skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; - } - - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); -} - -/* napi->gro_hash[].list contains packets ordered by age. - * youngest packets at the head of it. - * Complete skbs in reverse order to reduce latencies. - */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) -{ - unsigned long bitmask = napi->gro_bitmask; - unsigned int i, base = ~0U; - - while ((i = ffs(bitmask)) != 0) { - bitmask >>= i; - base += i; - __napi_gro_flush_chain(napi, base, flush_old); - } -} -EXPORT_SYMBOL(napi_gro_flush); - -static void gro_list_prepare(const struct list_head *head, - const struct sk_buff *skb) -{ - unsigned int maclen = skb->dev->hard_header_len; - u32 hash = skb_get_hash_raw(skb); - struct sk_buff *p; - - list_for_each_entry(p, head, list) { - unsigned long diffs; - - NAPI_GRO_CB(p)->flush = 0; - - if (hash != skb_get_hash_raw(p)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); - if (skb_vlan_tag_present(p)) - diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_differs(p, skb); - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_mac_header(skb), - maclen); - - /* in most common scenarions 'slow_gro' is 0 - * otherwise we are already on some slower paths - * either skip all the infrequent tests altogether or - * avoid trying too hard to skip each of them individually - */ - if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *skb_ext; - struct tc_skb_ext *p_ext; -#endif - - diffs |= p->sk != skb->sk; - diffs |= skb_metadata_dst_cmp(p, skb); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); - -#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext = skb_ext_find(skb, TC_SKB_EXT); - p_ext = skb_ext_find(p, TC_SKB_EXT); - - diffs |= (!!p_ext) ^ (!!skb_ext); - if (!diffs && unlikely(skb_ext)) - diffs |= p_ext->chain ^ skb_ext->chain; -#endif - } - - NAPI_GRO_CB(p)->same_flow = !diffs; - } -} - -static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) -{ - const struct skb_shared_info *pinfo = skb_shinfo(skb); - const skb_frag_t *frag0 = &pinfo->frags[0]; - - NAPI_GRO_CB(skb)->data_offset = 0; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; - - if (!skb_headlen(skb) && pinfo->nr_frags && - !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { - NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); - NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, - skb_frag_size(frag0), - skb->end - skb->tail); - } -} - -static void gro_pull_from_frag0(struct sk_buff *skb, int grow) -{ - struct skb_shared_info *pinfo = skb_shinfo(skb); - - BUG_ON(skb->end - skb->tail < grow); - - memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); - - skb->data_len -= grow; - skb->tail += grow; - - skb_frag_off_add(&pinfo->frags[0], grow); - skb_frag_size_sub(&pinfo->frags[0], grow); - - if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { - skb_frag_unref(skb, 0); - memmove(pinfo->frags, pinfo->frags + 1, - --pinfo->nr_frags * sizeof(pinfo->frags[0])); - } -} - -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) -{ - struct sk_buff *oldest; - - oldest = list_last_entry(head, struct sk_buff, list); - - /* We are called with head length >= MAX_GRO_SKBS, so this is - * impossible. - */ - if (WARN_ON_ONCE(!oldest)) - return; - - /* Do not adjust napi->gro_hash[].count, caller is adding a new - * SKB to the chain. - */ - skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); -} - -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; - struct list_head *head = &offload_base; - struct packet_offload *ptype; - __be16 type = skb->protocol; - struct sk_buff *pp = NULL; - enum gro_result ret; - int same_flow; - int grow; - - if (netif_elide_gro(skb->dev)) - goto normal; - - gro_list_prepare(&gro_list->list, skb); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - NAPI_GRO_CB(skb)->same_flow = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->free = 0; - NAPI_GRO_CB(skb)->encap_mark = 0; - NAPI_GRO_CB(skb)->recursion_counter = 0; - NAPI_GRO_CB(skb)->is_fou = 0; - NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->gro_remcsum_start = 0; - - /* Setup for GRO checksum validation */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - NAPI_GRO_CB(skb)->csum_cnt = 0; - break; - case CHECKSUM_UNNECESSARY: - NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - NAPI_GRO_CB(skb)->csum_valid = 0; - break; - default: - NAPI_GRO_CB(skb)->csum_cnt = 0; - NAPI_GRO_CB(skb)->csum_valid = 0; - } - - pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, - ipv6_gro_receive, inet_gro_receive, - &gro_list->list, skb); - break; - } - rcu_read_unlock(); - - if (&ptype->list == head) - goto normal; - - if (PTR_ERR(pp) == -EINPROGRESS) { - ret = GRO_CONSUMED; - goto ok; - } - - same_flow = NAPI_GRO_CB(skb)->same_flow; - ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; - - if (pp) { - skb_list_del_init(pp); - napi_gro_complete(napi, pp); - gro_list->count--; - } - - if (same_flow) - goto ok; - - if (NAPI_GRO_CB(skb)->flush) - goto normal; - - if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); - else - gro_list->count++; - - NAPI_GRO_CB(skb)->count = 1; - NAPI_GRO_CB(skb)->age = jiffies; - NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, &gro_list->list); - ret = GRO_HELD; - -pull: - grow = skb_gro_offset(skb) - skb_headlen(skb); - if (grow > 0) - gro_pull_from_frag0(skb, grow); -ok: - if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); - } - - return ret; - -normal: - ret = GRO_NORMAL; - goto pull; -} - -struct packet_offload *gro_find_receive_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_receive_by_type); - -struct packet_offload *gro_find_complete_by_type(__be16 type) -{ - struct list_head *offload_head = &offload_base; - struct packet_offload *ptype; - - list_for_each_entry_rcu(ptype, offload_head, list) { - if (ptype->type != type || !ptype->callbacks.gro_complete) - continue; - return ptype; - } - return NULL; -} -EXPORT_SYMBOL(gro_find_complete_by_type); - -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) - __kfree_skb(skb); - else - __kfree_skb_defer(skb); - break; - - case GRO_HELD: - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - gro_result_t ret; - - skb_mark_napi_id(skb, napi); - trace_napi_gro_receive_entry(skb); - - skb_gro_reset_offset(skb, 0); - - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_receive_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_receive); - -static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) -{ - if (unlikely(skb->pfmemalloc)) { - consume_skb(skb); - return; - } - __skb_pull(skb, skb_headlen(skb)); - /* restore the reserve we had after netdev_alloc_skb_ip_align() */ - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); - __vlan_hwaccel_clear_tag(skb); - skb->dev = napi->dev; - skb->skb_iif = 0; - - /* eth_type_trans() assumes pkt_type is PACKET_HOST */ - skb->pkt_type = PACKET_HOST; - - skb->encapsulation = 0; - skb_shinfo(skb)->gso_type = 0; - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - if (unlikely(skb->slow_gro)) { - skb_orphan(skb); - skb_ext_reset(skb); - nf_reset_ct(skb); - skb->slow_gro = 0; - } - - napi->skb = skb; -} - -struct sk_buff *napi_get_frags(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - - if (!skb) { - skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - if (skb) { - napi->skb = skb; - skb_mark_napi_id(skb, napi); - } - } - return skb; -} -EXPORT_SYMBOL(napi_get_frags); - -static gro_result_t napi_frags_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) -{ - switch (ret) { - case GRO_NORMAL: - case GRO_HELD: - __skb_push(skb, ETH_HLEN); - skb->protocol = eth_type_trans(skb, skb->dev); - if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); - break; - - case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) - napi_skb_free_stolen_head(skb); - else - napi_reuse_skb(napi, skb); - break; - - case GRO_MERGED: - case GRO_CONSUMED: - break; - } - - return ret; -} - -/* Upper GRO stack assumes network header starts at gro_offset=0 - * Drivers could call both napi_gro_frags() and napi_gro_receive() - * We copy ethernet header into skb->data to have a common layout. - */ -static struct sk_buff *napi_frags_skb(struct napi_struct *napi) -{ - struct sk_buff *skb = napi->skb; - const struct ethhdr *eth; - unsigned int hlen = sizeof(*eth); - - napi->skb = NULL; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb, hlen); - - if (unlikely(skb_gro_header_hard(skb, hlen))) { - eth = skb_gro_header_slow(skb, hlen, 0); - if (unlikely(!eth)) { - net_warn_ratelimited("%s: dropping impossible skb from %s\n", - __func__, napi->dev->name); - napi_reuse_skb(napi, skb); - return NULL; - } - } else { - eth = (const struct ethhdr *)skb->data; - gro_pull_from_frag0(skb, hlen); - NAPI_GRO_CB(skb)->frag0 += hlen; - NAPI_GRO_CB(skb)->frag0_len -= hlen; - } - __skb_pull(skb, hlen); - - /* - * This works because the only protocols we care about don't require - * special handling. - * We'll fix it up properly in napi_frags_finish() - */ - skb->protocol = eth->h_proto; - - return skb; -} - -gro_result_t napi_gro_frags(struct napi_struct *napi) -{ - gro_result_t ret; - struct sk_buff *skb = napi_frags_skb(napi); - - trace_napi_gro_frags_entry(skb); - - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); - trace_napi_gro_frags_exit(ret); - - return ret; -} -EXPORT_SYMBOL(napi_gro_frags); - -/* Compute the checksum from gro_offset and return the folded value - * after adding in any pseudo checksum. - */ -__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) -{ - __wsum wsum; - __sum16 sum; - - wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); - - /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ - sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); - /* See comments in __skb_checksum_complete(). */ - if (likely(!sum)) { - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && - !skb->csum_complete_sw) - netdev_rx_csum_fault(skb->dev, skb); - } - - NAPI_GRO_CB(skb)->csum = wsum; - NAPI_GRO_CB(skb)->csum_valid = 1; - - return sum; -} -EXPORT_SYMBOL(__skb_gro_checksum_complete); - static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS @@ -11640,8 +10976,6 @@ static int __init net_dev_init(void) for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); - INIT_LIST_HEAD(&offload_base); - if (register_pernet_subsys(&netdev_net_ops)) goto out; diff --git a/net/core/gro.c b/net/core/gro.c index 91a74c4da9ff..8ec8b44596da 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,5 +1,129 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include +#include + +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +static DEFINE_SPINLOCK(offload_lock); +static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); +/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ +int gro_normal_batch __read_mostly = 8; + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct packet_offload *elem; + + spin_lock(&offload_lock); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +static void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + int vlan_depth = skb->mac_len; + __be16 type = skb_network_protocol(skb, &vlan_depth); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + + __skb_pull(skb, vlan_depth); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { + segs = ptype->callbacks.gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_mac_gso_segment); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { @@ -116,3 +240,527 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } + + +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +{ + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct list_head *head = &offload_base; + int err = -ENOENT; + + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, 0); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return; + } + +out: + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); +} + +static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, + bool flush_old) +{ + struct list_head *head = &napi->gro_hash[index].list; + struct sk_buff *skb, *p; + + list_for_each_entry_safe_reverse(skb, p, head, list) { + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + skb_list_del_init(skb); + napi_gro_complete(napi, skb); + napi->gro_hash[index].count--; + } + + if (!napi->gro_hash[index].count) + __clear_bit(index, &napi->gro_bitmask); +} + +/* napi->gro_hash[].list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + unsigned long bitmask = napi->gro_bitmask; + unsigned int i, base = ~0U; + + while ((i = ffs(bitmask)) != 0) { + bitmask >>= i; + base += i; + __napi_gro_flush_chain(napi, base, flush_old); + } +} +EXPORT_SYMBOL(napi_gro_flush); + +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) +{ + unsigned int maclen = skb->dev->hard_header_len; + u32 hash = skb_get_hash_raw(skb); + struct sk_buff *p; + + list_for_each_entry(p, head, list) { + unsigned long diffs; + + NAPI_GRO_CB(p)->flush = 0; + + if (hash != skb_get_hash_raw(p)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); + if (skb_vlan_tag_present(p)) + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); + diffs |= skb_metadata_differs(p, skb); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_mac_header(skb), + maclen); + + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); + + diffs |= (!!p_ext) ^ (!!skb_ext); + if (!diffs && unlikely(skb_ext)) + diffs |= p_ext->chain ^ skb_ext->chain; +#endif + } + + NAPI_GRO_CB(p)->same_flow = !diffs; + } +} + +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) +{ + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (!skb_headlen(skb) && pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0)) && + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, + skb_frag_size(frag0), + skb->end - skb->tail); + } +} + +static void gro_pull_from_frag0(struct sk_buff *skb, int grow) +{ + struct skb_shared_info *pinfo = skb_shinfo(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->data_len -= grow; + skb->tail += grow; + + skb_frag_off_add(&pinfo->frags[0], grow); + skb_frag_size_sub(&pinfo->frags[0], grow); + + if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(pinfo->frags, pinfo->frags + 1, + --pinfo->nr_frags * sizeof(pinfo->frags[0])); + } +} + +static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +{ + struct sk_buff *oldest; + + oldest = list_last_entry(head, struct sk_buff, list); + + /* We are called with head length >= MAX_GRO_SKBS, so this is + * impossible. + */ + if (WARN_ON_ONCE(!oldest)) + return; + + /* Do not adjust napi->gro_hash[].count, caller is adding a new + * SKB to the chain. + */ + skb_list_del_init(oldest); + napi_gro_complete(napi, oldest); +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; + struct list_head *head = &offload_base; + struct packet_offload *ptype; + __be16 type = skb->protocol; + struct sk_buff *pp = NULL; + enum gro_result ret; + int same_flow; + int grow; + + if (netif_elide_gro(skb->dev)) + goto normal; + + gro_list_prepare(&gro_list->list, skb); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->free = 0; + NAPI_GRO_CB(skb)->encap_mark = 0; + NAPI_GRO_CB(skb)->recursion_counter = 0; + NAPI_GRO_CB(skb)->is_fou = 0; + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->gro_remcsum_start = 0; + + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + NAPI_GRO_CB(skb)->csum_cnt = 0; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; + NAPI_GRO_CB(skb)->csum_valid = 0; + break; + default: + NAPI_GRO_CB(skb)->csum_cnt = 0; + NAPI_GRO_CB(skb)->csum_valid = 0; + } + + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + if (PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + skb_list_del_init(pp); + napi_gro_complete(napi, pp); + gro_list->count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush) + goto normal; + + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + + NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; + NAPI_GRO_CB(skb)->last = skb; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + list_add(&skb->list, &gro_list->list); + ret = GRO_HELD; + +pull: + grow = skb_gro_offset(skb) - skb_headlen(skb); + if (grow > 0) + gro_pull_from_frag0(skb, grow); +ok: + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); + } + + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} + +struct packet_offload *gro_find_receive_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_receive) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_receive_by_type); + +struct packet_offload *gro_find_complete_by_type(__be16 type) +{ + struct list_head *offload_head = &offload_base; + struct packet_offload *ptype; + + list_for_each_entry_rcu(ptype, offload_head, list) { + if (ptype->type != type || !ptype->callbacks.gro_complete) + continue; + return ptype; + } + return NULL; +} +EXPORT_SYMBOL(gro_find_complete_by_type); + +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) + __kfree_skb(skb); + else + __kfree_skb_defer(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + gro_result_t ret; + + skb_mark_napi_id(skb, napi); + trace_napi_gro_receive_entry(skb); + + skb_gro_reset_offset(skb, 0); + + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_receive_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + if (unlikely(skb->pfmemalloc)) { + consume_skb(skb); + return; + } + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + __vlan_hwaccel_clear_tag(skb); + skb->dev = napi->dev; + skb->skb_iif = 0; + + /* eth_type_trans() assumes pkt_type is PACKET_HOST */ + skb->pkt_type = PACKET_HOST; + + skb->encapsulation = 0; + skb_shinfo(skb)->gso_type = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = napi_alloc_skb(napi, GRO_MAX_HEAD); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +static gro_result_t napi_frags_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + __skb_push(skb, ETH_HLEN); + skb->protocol = eth_type_trans(skb, skb->dev); + if (ret == GRO_NORMAL) + gro_normal_one(napi, skb, 1); + break; + + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + case GRO_CONSUMED: + break; + } + + return ret; +} + +/* Upper GRO stack assumes network header starts at gro_offset=0 + * Drivers could call both napi_gro_frags() and napi_gro_receive() + * We copy ethernet header into skb->data to have a common layout. + */ +static struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + const struct ethhdr *eth; + unsigned int hlen = sizeof(*eth); + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb, hlen); + + if (unlikely(skb_gro_header_hard(skb, hlen))) { + eth = skb_gro_header_slow(skb, hlen, 0); + if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); + napi_reuse_skb(napi, skb); + return NULL; + } + } else { + eth = (const struct ethhdr *)skb->data; + gro_pull_from_frag0(skb, hlen); + NAPI_GRO_CB(skb)->frag0 += hlen; + NAPI_GRO_CB(skb)->frag0_len -= hlen; + } + __skb_pull(skb, hlen); + + /* + * This works because the only protocols we care about don't require + * special handling. + * We'll fix it up properly in napi_frags_finish() + */ + skb->protocol = eth->h_proto; + + return skb; +} + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + gro_result_t ret; + struct sk_buff *skb = napi_frags_skb(napi); + + trace_napi_gro_frags_entry(skb); + + ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + trace_napi_gro_frags_exit(ret); + + return ret; +} +EXPORT_SYMBOL(napi_gro_frags); + +/* Compute the checksum from gro_offset and return the folded value + * after adding in any pseudo checksum. + */ +__sum16 __skb_gro_checksum_complete(struct sk_buff *skb) +{ + __wsum wsum; + __sum16 sum; + + wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); + + /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ + sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); + /* See comments in __skb_checksum_complete(). */ + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(skb->dev, skb); + } + + NAPI_GRO_CB(skb)->csum = wsum; + NAPI_GRO_CB(skb)->csum_valid = 1; + + return sum; +} +EXPORT_SYMBOL(__skb_gro_checksum_complete); -- cgit v1.2.3 From 7071732c26fe2cf141185ed16a8a85d02495ae8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Nov 2021 09:23:02 -0800 Subject: net: use .data.once section in netdev_level_once() Same rationale than prior patch : using the dedicated section avoid holes and pack all these bool values. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 31a7e6b27681..dd328364dfe9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __print_once __read_mostly; \ + static bool __section(".data.once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ -- cgit v1.2.3 From 8160fb43d55d26d64607fd32fe69185a5f5fe41f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:21 -0800 Subject: net: use an atomic_long_t for queue->trans_timeout tx_timeout_show() assumed dev_watchdog() would stop all the queues, to fetch queue->trans_timeout under protection of the queue->_xmit_lock. As we want to no longer disrupt transmits, we use an atomic_long_t instead. Signed-off-by: Eric Dumazet Cc: david decotigny Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/net-sysfs.c | 6 +----- net/sched/sch_generic.c | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd328364dfe9..1d22483cf78c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -592,7 +592,7 @@ struct netdev_queue { * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ - unsigned long trans_timeout; + atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9c01c642cf9e..addbef5419fb 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1201,11 +1201,7 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { - unsigned long trans_timeout; - - spin_lock_irq(&queue->_xmit_lock); - trans_timeout = queue->trans_timeout; - spin_unlock_irq(&queue->_xmit_lock); + unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); return sprintf(buf, fmt_ulong, trans_timeout); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3b0f62095803..1b4328bd495d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -467,7 +467,7 @@ static void dev_watchdog(struct timer_list *t) time_after(jiffies, (trans_start + dev->watchdog_timeo))) { some_queue_timedout = 1; - txq->trans_timeout++; + atomic_long_inc(&txq->trans_timeout); break; } } -- cgit v1.2.3 From 5337824f4dc4bb26f38fbbba4ffb425a92803f15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:22 -0800 Subject: net: annotate accesses to queue->trans_start In following patches, dev_watchdog() will no longer stop all queues. It will read queue->trans_start locklessly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 2 +- drivers/net/ethernet/atheros/ag71xx.c | 2 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 4 ++-- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++--- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/wireless/marvell/mwifiex/init.c | 2 +- drivers/staging/rtl8192e/rtllib_softmac.c | 2 +- include/linux/netdevice.h | 16 +++++++++++++--- net/sched/sch_generic.c | 8 ++++---- 14 files changed, 33 insertions(+), 23 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 220dc42af31a..ff2d099aab21 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -869,7 +869,7 @@ static void xgene_enet_timeout(struct net_device *ndev, unsigned int txqueue) for (i = 0; i < pdata->txq_cnt; i++) { txq = netdev_get_tx_queue(ndev, i); - txq->trans_start = jiffies; + txq_trans_cond_update(txq); netif_tx_start_queue(txq); } } diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 88d2ab748399..e4f30bb7498f 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -766,7 +766,7 @@ static bool ag71xx_check_dma_stuck(struct ag71xx *ag) unsigned long timestamp; u32 rx_sm, tx_sm, rx_fd; - timestamp = netdev_get_tx_queue(ag->ndev, 0)->trans_start; + timestamp = READ_ONCE(netdev_get_tx_queue(ag->ndev, 0)->trans_start); if (likely(time_before(jiffies, timestamp + HZ / 10))) return false; diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 6b2927d863e2..d6871437d951 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2325,7 +2325,7 @@ dpaa_start_xmit(struct sk_buff *skb, struct net_device *net_dev) txq = netdev_get_tx_queue(net_dev, queue_mapping); /* LLTX requires to do our own update of trans_start */ - txq->trans_start = jiffies; + txq_trans_cond_update(txq); if (priv->tx_tstamp && skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) { fd.cmd |= cpu_to_be32(FM_FD_CMD_UPD); @@ -2531,7 +2531,7 @@ static int dpaa_xdp_xmit_frame(struct net_device *net_dev, /* Bump the trans_start */ txq = netdev_get_tx_queue(net_dev, smp_processor_id()); - txq->trans_start = jiffies; + txq_trans_cond_update(txq); err = dpaa_xmit(priv, percpu_stats, smp_processor_id(), &fd); if (err) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 13835a37b3a2..d5100179f8d5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2679,7 +2679,7 @@ static bool hns3_get_tx_timeo_queue_info(struct net_device *ndev) unsigned long trans_start; q = netdev_get_tx_queue(ndev, i); - trans_start = q->trans_start; + trans_start = READ_ONCE(q->trans_start); if (netif_xmit_stopped(q) && time_after(jiffies, (trans_start + ndev->watchdog_timeo))) { diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 3cca51735421..c327fc8860da 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2058,7 +2058,7 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_packets++; tx_bytes += skb->len; - txq->trans_start = jiffies; + txq_trans_cond_update(txq); ret = NETDEV_TX_OK; goto out; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 836be0d3b291..18a019a47182 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2927,7 +2927,7 @@ static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); /* Avoid transmit queue timeout since we share it with the slow path */ - nq->trans_start = jiffies; + txq_trans_cond_update(nq); ret = igb_xmit_xdp_ring(adapter, tx_ring, xdpf); __netif_tx_unlock(nq); @@ -2961,7 +2961,7 @@ static int igb_xdp_xmit(struct net_device *dev, int n, __netif_tx_lock(nq, cpu); /* Avoid transmit queue timeout since we share it with the slow path */ - nq->trans_start = jiffies; + txq_trans_cond_update(nq); for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 4f4bc8726ec4..860605133287 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -565,7 +565,7 @@ int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) snprintf(err_str, sizeof(err_str), "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u", sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, - jiffies_to_usecs(jiffies - sq->txq->trans_start)); + jiffies_to_usecs(jiffies - READ_ONCE(sq->txq->trans_start))); mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); return to_ctx.status; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 033c35c09a54..389d125310c1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2356,7 +2356,7 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget) bool work_done = true; /* Avoids TX time-out as we are sharing with slow path */ - nq->trans_start = jiffies; + txq_trans_cond_update(nq->trans_start); budget = min(budget, stmmac_tx_avail(priv, queue)); @@ -4657,7 +4657,7 @@ static int stmmac_xdp_xmit_back(struct stmmac_priv *priv, __netif_tx_lock(nq, cpu); /* Avoids TX time-out as we are sharing with slow path */ - nq->trans_start = jiffies; + txq_trans_cond_update(nq->trans_start); res = stmmac_xdp_xmit_xdpf(priv, queue, xdpf, false); if (res == STMMAC_XDP_TX) @@ -6293,7 +6293,7 @@ static int stmmac_xdp_xmit(struct net_device *dev, int num_frames, __netif_tx_lock(nq, cpu); /* Avoids TX time-out as we are sharing with slow path */ - nq->trans_start = jiffies; + txq_trans_cond_update(nq); for (i = 0; i < num_frames; i++) { int res; diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index c092cb61416a..750cea23e9cd 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -345,7 +345,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, netif_txq = netdev_get_tx_queue(ndev, txqueue); tx_chn = &common->tx_chns[txqueue]; - trans_start = netif_txq->trans_start; + trans_start = READ_ONCE(netif_txq->trans_start); netdev_err(ndev, "txq:%d DRV_XOFF:%d tmo:%u dql_avail:%d free_desc:%zu\n", txqueue, diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1771d6e5224f..03e38e38ee4b 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2694,7 +2694,7 @@ static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n", txqueue, sq->name, sq->vq->index, sq->vq->name, - jiffies_to_usecs(jiffies - txq->trans_start)); + jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start))); } static const struct net_device_ops virtnet_netdev = { diff --git a/drivers/net/wireless/marvell/mwifiex/init.c b/drivers/net/wireless/marvell/mwifiex/init.c index f006a3d72b40..88c72d1827a0 100644 --- a/drivers/net/wireless/marvell/mwifiex/init.c +++ b/drivers/net/wireless/marvell/mwifiex/init.c @@ -332,7 +332,7 @@ void mwifiex_set_trans_start(struct net_device *dev) int i; for (i = 0; i < dev->num_tx_queues; i++) - netdev_get_tx_queue(dev, i)->trans_start = jiffies; + txq_trans_cond_update(netdev_get_tx_queue(dev, i)); netif_trans_update(dev); } diff --git a/drivers/staging/rtl8192e/rtllib_softmac.c b/drivers/staging/rtl8192e/rtllib_softmac.c index d2726d01c757..aabbea48223d 100644 --- a/drivers/staging/rtl8192e/rtllib_softmac.c +++ b/drivers/staging/rtl8192e/rtllib_softmac.c @@ -2515,7 +2515,7 @@ void rtllib_stop_all_queues(struct rtllib_device *ieee) unsigned int i; for (i = 0; i < ieee->dev->num_tx_queues; i++) - netdev_get_tx_queue(ieee->dev, i)->trans_start = jiffies; + txq_trans_cond_update(netdev_get_tx_queue(ieee->dev, i)); netif_tx_stop_all_queues(ieee->dev); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d22483cf78c..279409ef2b18 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4095,10 +4095,21 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) spin_unlock_bh(&txq->_xmit_lock); } +/* + * txq->trans_start can be read locklessly from dev_watchdog() + */ static inline void txq_trans_update(struct netdev_queue *txq) { if (txq->xmit_lock_owner != -1) - txq->trans_start = jiffies; + WRITE_ONCE(txq->trans_start, jiffies); +} + +static inline void txq_trans_cond_update(struct netdev_queue *txq) +{ + unsigned long now = jiffies; + + if (READ_ONCE(txq->trans_start) != now) + WRITE_ONCE(txq->trans_start, now); } /* legacy drivers only, netdev_start_xmit() sets txq->trans_start */ @@ -4106,8 +4117,7 @@ static inline void netif_trans_update(struct net_device *dev) { struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); - if (txq->trans_start != jiffies) - txq->trans_start = jiffies; + txq_trans_cond_update(txq); } /** diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1b4328bd495d..02c46041f76e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -434,9 +434,9 @@ unsigned long dev_trans_start(struct net_device *dev) dev = vlan_dev_real_dev(dev); else if (netif_is_macvlan(dev)) dev = macvlan_dev_real_dev(dev); - res = netdev_get_tx_queue(dev, 0)->trans_start; + res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start); for (i = 1; i < dev->num_tx_queues; i++) { - val = netdev_get_tx_queue(dev, i)->trans_start; + val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start); if (val && time_after(val, res)) res = val; } @@ -462,7 +462,7 @@ static void dev_watchdog(struct timer_list *t) struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - trans_start = txq->trans_start; + trans_start = READ_ONCE(txq->trans_start); if (netif_xmit_stopped(txq) && time_after(jiffies, (trans_start + dev->watchdog_timeo))) { @@ -1148,7 +1148,7 @@ static void transition_one_qdisc(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, new_qdisc); if (need_watchdog_p) { - dev_queue->trans_start = 0; + WRITE_ONCE(dev_queue->trans_start, 0); *need_watchdog_p = 1; } } -- cgit v1.2.3 From dab8fe320726b38a6b1dc6a7ca6e386c5f7779e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Nov 2021 19:29:23 -0800 Subject: net: do not inline netif_tx_lock()/netif_tx_unlock() These are not fast path, there is no point in inlining them. Also provide netif_freeze_queues()/netif_unfreeze_queues() so that we can use them from dev_watchdog() in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++---------------------------------- net/sched/sch_generic.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 37 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 279409ef2b18..4f4a299e92de 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4126,27 +4126,7 @@ static inline void netif_trans_update(struct net_device *dev) * * Get network device transmit lock */ -static inline void netif_tx_lock(struct net_device *dev) -{ - unsigned int i; - int cpu; - - spin_lock(&dev->tx_global_lock); - cpu = smp_processor_id(); - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* We are the only thread of execution doing a - * freeze, but we have to grab the _xmit_lock in - * order to synchronize with threads which are in - * the ->hard_start_xmit() handler and already - * checked the frozen bit. - */ - __netif_tx_lock(txq, cpu); - set_bit(__QUEUE_STATE_FROZEN, &txq->state); - __netif_tx_unlock(txq); - } -} +void netif_tx_lock(struct net_device *dev); static inline void netif_tx_lock_bh(struct net_device *dev) { @@ -4154,22 +4134,7 @@ static inline void netif_tx_lock_bh(struct net_device *dev) netif_tx_lock(dev); } -static inline void netif_tx_unlock(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - - /* No need to grab the _xmit_lock here. If the - * queue is not stopped for another reason, we - * force a schedule. - */ - clear_bit(__QUEUE_STATE_FROZEN, &txq->state); - netif_schedule_queue(txq); - } - spin_unlock(&dev->tx_global_lock); -} +void netif_tx_unlock(struct net_device *dev); static inline void netif_tx_unlock_bh(struct net_device *dev) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 02c46041f76e..389e0d8fc68d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -445,6 +445,57 @@ unsigned long dev_trans_start(struct net_device *dev) } EXPORT_SYMBOL(dev_trans_start); +static void netif_freeze_queues(struct net_device *dev) +{ + unsigned int i; + int cpu; + + cpu = smp_processor_id(); + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + /* We are the only thread of execution doing a + * freeze, but we have to grab the _xmit_lock in + * order to synchronize with threads which are in + * the ->hard_start_xmit() handler and already + * checked the frozen bit. + */ + __netif_tx_lock(txq, cpu); + set_bit(__QUEUE_STATE_FROZEN, &txq->state); + __netif_tx_unlock(txq); + } +} + +void netif_tx_lock(struct net_device *dev) +{ + spin_lock(&dev->tx_global_lock); + netif_freeze_queues(dev); +} +EXPORT_SYMBOL(netif_tx_lock); + +static void netif_unfreeze_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + /* No need to grab the _xmit_lock here. If the + * queue is not stopped for another reason, we + * force a schedule. + */ + clear_bit(__QUEUE_STATE_FROZEN, &txq->state); + netif_schedule_queue(txq); + } +} + +void netif_tx_unlock(struct net_device *dev) +{ + netif_unfreeze_queues(dev); + spin_unlock(&dev->tx_global_lock); +} +EXPORT_SYMBOL(netif_tx_unlock); + static void dev_watchdog(struct timer_list *t) { struct net_device *dev = from_timer(dev, t, watchdog_timer); -- cgit v1.2.3 From adeef3e32146a8d2a73c399dc6f5d76a449131b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:51 -0800 Subject: net: constify netdev->dev_addr Commit 406f42fa0d3c ("net-next: When a bond have a massive amount of VLANs...") introduced a rbtree for faster Ethernet address look up. We converted all users to make modifications via appropriate helpers, make netdev->dev_addr const. The update helpers need to upcast from the buffer to struct netdev_hw_addr. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +++++--------- net/core/dev_addr_lists.c | 10 ++++++++++ 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4f4a299e92de..2462195784a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2117,7 +2117,7 @@ struct net_device { * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ - unsigned char *dev_addr; + const unsigned char *dev_addr; struct netdev_rx_queue *_rx; unsigned int num_rx_queues; @@ -4268,10 +4268,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len); + static inline void __dev_addr_set(struct net_device *dev, const void *addr, size_t len) { - memcpy(dev->dev_addr, addr, len); + dev_addr_mod(dev, 0, addr, len); } static inline void dev_addr_set(struct net_device *dev, const u8 *addr) @@ -4279,13 +4282,6 @@ static inline void dev_addr_set(struct net_device *dev, const u8 *addr) __dev_addr_set(dev, addr, dev->addr_len); } -static inline void -dev_addr_mod(struct net_device *dev, unsigned int offset, - const void *addr, size_t len) -{ - memcpy(&dev->dev_addr[offset], addr, len); -} - int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index f0cb38344126..ae8b1ef00fec 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -549,6 +549,16 @@ int dev_addr_init(struct net_device *dev) } EXPORT_SYMBOL(dev_addr_init); +void dev_addr_mod(struct net_device *dev, unsigned int offset, + const void *addr, size_t len) +{ + struct netdev_hw_addr *ha; + + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); + memcpy(&ha->addr[offset], addr, len); +} +EXPORT_SYMBOL(dev_addr_mod); + /** * dev_addr_add - Add a device address * @dev: device -- cgit v1.2.3 From d07b26f5bbea9ade34dfd6abea7b3ca056c03cd1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Nov 2021 06:21:53 -0800 Subject: dev_addr: add a modification check netdev->dev_addr should only be modified via helpers, but someone may be casting off the const. Add a runtime check to catch abuses. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ net/core/dev.c | 1 + net/core/dev_addr_lists.c | 19 +++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2462195784a9..cb7f2661d187 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * + * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2268,6 +2270,8 @@ struct net_device { /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; + + u8 dev_addr_shadow[MAX_ADDR_LEN]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -4288,6 +4292,7 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); diff --git a/net/core/dev.c b/net/core/dev.c index 92c9258cbf28..9219e319e901 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1377,6 +1377,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) int ret; ASSERT_RTNL(); + dev_addr_check(dev); if (!netif_device_present(dev)) { /* may be detached because parent is runtime-suspended */ diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index a23a83ac18e5..969942734951 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -498,6 +498,21 @@ EXPORT_SYMBOL(__hw_addr_init); * Device addresses handling functions */ +/* Check that netdev->dev_addr is not written to directly as this would + * break the rbtree layout. All changes should go thru dev_addr_set() and co. + * Remove this check in mid-2024. + */ +void dev_addr_check(struct net_device *dev) +{ + if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) + return; + + netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); + netdev_warn(dev, "Expected addr: %*ph\n", + MAX_ADDR_LEN, dev->dev_addr_shadow); + netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); +} + /** * dev_addr_flush - Flush device address list * @dev: device @@ -509,6 +524,7 @@ EXPORT_SYMBOL(__hw_addr_init); void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ + dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; @@ -552,8 +568,11 @@ void dev_addr_mod(struct net_device *dev, unsigned int offset, { struct netdev_hw_addr *ha; + dev_addr_check(dev); + ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); memcpy(&ha->addr[offset], addr, len); + memcpy(&dev->dev_addr_shadow[offset], addr, len); } EXPORT_SYMBOL(dev_addr_mod); -- cgit v1.2.3 From 4b66d2161b8125b6caa6971815e85631cf3cf36f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:31 -0800 Subject: net: annotate accesses to dev->gso_max_size dev->gso_max_size is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_size() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 2 +- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- drivers/net/ethernet/sfc/ef100_nic.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- drivers/net/macvlan.c | 4 ++-- drivers/net/veth.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 3 ++- net/8021q/vlan.c | 2 +- net/8021q/vlan_dev.c | 2 +- net/bridge/br_if.c | 2 +- net/core/sock.c | 3 ++- net/sctp/output.c | 2 +- 13 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 369f6ae700c7..927ddbdf650b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -286,7 +286,7 @@ nfp_repr_transfer_features(struct net_device *netdev, struct net_device *lower) if (repr->dst->u.port_info.lower_dev != lower) return; - netdev->gso_max_size = lower->gso_max_size; + netif_set_gso_max_size(netdev, lower->gso_max_size); netdev->gso_max_segs = lower->gso_max_segs; netdev_update_features(netdev); diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 14dd04b2fbed..ff222c54299d 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5390,11 +5390,11 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) */ if (rtl_chip_supports_csum_v2(tp)) { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6; - dev->gso_max_size = RTL_GSO_MAX_SIZE_V2; + netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V2); dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2; } else { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO; - dev->gso_max_size = RTL_GSO_MAX_SIZE_V1; + netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V1); dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1; } diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index 6aa81229b68a..d9015a77a6b8 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -993,7 +993,7 @@ static int ef100_process_design_param(struct efx_nic *efx, return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); - efx->net_dev->gso_max_size = nic_data->tso_max_payload_len; + netif_set_gso_max_size(efx->net_dev, nic_data->tso_max_payload_len); return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS: nic_data->tso_max_payload_num_segs = min_t(u64, reader->value, 0xffff); diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 1d2f4e7d7324..0ba63eccefa5 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -140,7 +140,7 @@ static int ipvlan_init(struct net_device *dev) dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; - dev->gso_max_size = phy_dev->gso_max_size; + netif_set_gso_max_size(dev, phy_dev->gso_max_size); dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; @@ -763,7 +763,7 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - ipvlan->dev->gso_max_size = dev->gso_max_size; + netif_set_gso_max_size(ipvlan->dev, dev->gso_max_size); ipvlan->dev->gso_max_segs = dev->gso_max_segs; netdev_update_features(ipvlan->dev); } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d2f830ec2969..506e9559df80 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -900,7 +900,7 @@ static int macvlan_init(struct net_device *dev) dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; - dev->gso_max_size = lowerdev->gso_max_size; + netif_set_gso_max_size(dev, lowerdev->gso_max_size); dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); @@ -1748,7 +1748,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { - vlan->dev->gso_max_size = dev->gso_max_size; + netif_set_gso_max_size(vlan->dev, dev->gso_max_size); vlan->dev->gso_max_segs = dev->gso_max_segs; netdev_update_features(vlan->dev); } diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 50eb43e5bf45..f2f66fb293fd 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1689,7 +1689,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; - peer->gso_max_size = dev->gso_max_size; + netif_set_gso_max_size(peer, dev->gso_max_size); peer->gso_max_segs = dev->gso_max_segs; err = register_netdevice(peer); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 563f86de0e0d..df4f1712be95 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3811,7 +3811,7 @@ static void vxlan_config_apply(struct net_device *dev, if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; - dev->gso_max_size = lowerdev->gso_max_size; + netif_set_gso_max_size(dev, lowerdev->gso_max_size); dev->gso_max_segs = lowerdev->gso_max_segs; needed_headroom = lowerdev->hard_header_len; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb7f2661d187..14eeb58ed197 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4731,7 +4731,8 @@ static inline bool netif_needs_gso(struct sk_buff *skb, static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { - dev->gso_max_size = size; + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); } static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a3a0a5e994f5..34c0ffa81e5f 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -322,7 +322,7 @@ static void vlan_transfer_features(struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); - vlandev->gso_max_size = dev->gso_max_size; + netif_set_gso_max_size(vlandev, dev->gso_max_size); vlandev->gso_max_segs = dev->gso_max_segs; if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ab6dee28536d..208f6845f6dd 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -573,7 +573,7 @@ static int vlan_dev_init(struct net_device *dev) NETIF_F_ALL_FCOE; dev->features |= dev->hw_features | NETIF_F_LLTX; - dev->gso_max_size = real_dev->gso_max_size; + netif_set_gso_max_size(dev, real_dev->gso_max_size); dev->gso_max_segs = real_dev->gso_max_segs; if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 64b2d4fb50f5..d3e1c2276551 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -525,7 +525,7 @@ static void br_set_gso_limits(struct net_bridge *br) gso_max_size = min(gso_max_size, p->dev->gso_max_size); gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); } - br->dev->gso_max_size = gso_max_size; + netif_set_gso_max_size(br->dev, gso_max_size); br->dev->gso_max_segs = gso_max_segs; } diff --git a/net/core/sock.c b/net/core/sock.c index 1f9be266e586..4418e2a07c34 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2257,7 +2257,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = dst->dev->gso_max_size; + /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ + sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } diff --git a/net/sctp/output.c b/net/sctp/output.c index cdfdbd353c67..72fe6669c50d 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -134,7 +134,7 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } - packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size + packet->max_size = sk_can_gso(sk) ? READ_ONCE(tp->dst->dev->gso_max_size) : asoc->pathmtu; rcu_read_unlock(); } -- cgit v1.2.3 From 6d872df3e3b91532b142de9044e5b4984017a55f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Nov 2021 07:43:32 -0800 Subject: net: annotate accesses to dev->gso_max_segs dev->gso_max_segs is written under RTNL protection, or when the device is not yet visible, but is read locklessly. Add netif_set_gso_max_segs() helper. Add the READ_ONCE()/WRITE_ONCE() pairs, and use netif_set_gso_max_segs() where we can to better document what is going on. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/freescale/fec_main.c | 2 +- drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 4 ++-- drivers/net/ethernet/realtek/r8169_main.c | 4 ++-- drivers/net/ethernet/sfc/ef100_nic.c | 4 ++-- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/falcon/efx.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- drivers/net/macvlan.c | 4 ++-- drivers/net/veth.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 7 +++++++ net/8021q/vlan.c | 2 +- net/8021q/vlan_dev.c | 2 +- net/bridge/br_if.c | 2 +- net/core/dev.c | 2 +- net/core/rtnetlink.c | 4 ++-- net/core/sock.c | 3 ++- 24 files changed, 37 insertions(+), 29 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ff8da720a33a..cf73eacdda91 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1460,7 +1460,7 @@ done: bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; - bond_dev->gso_max_segs = gso_max_segs; + netif_set_gso_max_segs(bond_dev, gso_max_segs); netif_set_gso_max_size(bond_dev, gso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index bc418b910999..f38e70692514 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3558,7 +3558,7 @@ static int fec_enet_init(struct net_device *ndev) ndev->features |= NETIF_F_HW_VLAN_CTAG_RX; if (fep->quirks & FEC_QUIRK_HAS_CSUM) { - ndev->gso_max_segs = FEC_MAX_TSO_SEGS; + netif_set_gso_max_segs(ndev, FEC_MAX_TSO_SEGS); /* enable hw accelerator */ ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 116919730237..105247582684 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -3201,7 +3201,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev) dev->hw_features = dev->features; dev->priv_flags |= IFF_UNICAST_FLT; - dev->gso_max_segs = MV643XX_MAX_TSO_SEGS; + netif_set_gso_max_segs(dev, MV643XX_MAX_TSO_SEGS); /* MTU range: 64 - 9500 */ dev->min_mtu = 64; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 350d24d7ead0..80e4b500695e 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -5335,7 +5335,7 @@ static int mvneta_probe(struct platform_device *pdev) dev->hw_features |= dev->features; dev->vlan_features |= dev->features; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->gso_max_segs = MVNETA_MAX_TSO_SEGS; + netif_set_gso_max_segs(dev, MVNETA_MAX_TSO_SEGS); /* MTU range: 68 - 9676 */ dev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 8b1ca0b13e88..49fc31fe0db8 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -6857,7 +6857,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, mvpp2_set_hw_csum(port, port->pool_long->id); dev->vlan_features |= features; - dev->gso_max_segs = MVPP2_MAX_TSO_SEGS; + netif_set_gso_max_segs(dev, MVPP2_MAX_TSO_SEGS); dev->priv_flags |= IFF_UNICAST_FLT; /* MTU range: 68 - 9704 */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 1e0d0c9c1dac..1333edf1c361 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -2741,7 +2741,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) netdev->hw_features |= NETIF_F_LOOPBACK | NETIF_F_RXALL; - netdev->gso_max_segs = OTX2_MAX_GSO_SEGS; + netif_set_gso_max_segs(netdev, OTX2_MAX_GSO_SEGS); netdev->watchdog_timeo = OTX2_TX_TIMEOUT; netdev->netdev_ops = &otx2_netdev_ops; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 78944ad3492f..254bebffe8c1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -663,7 +663,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) netdev->hw_features |= NETIF_F_NTUPLE; netdev->hw_features |= NETIF_F_RXALL; - netdev->gso_max_segs = OTX2_MAX_GSO_SEGS; + netif_set_gso_max_segs(netdev, OTX2_MAX_GSO_SEGS); netdev->watchdog_timeo = OTX2_TX_TIMEOUT; netdev->netdev_ops = &otx2vf_netdev_ops; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 850bfdf83d0a..6e38da4ad554 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -4097,7 +4097,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn) netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = nn->max_mtu; - netdev->gso_max_segs = NFP_NET_LSO_MAX_SEGS; + netif_set_gso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); netif_carrier_off(netdev); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 927ddbdf650b..181ac8e789a3 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -287,7 +287,7 @@ nfp_repr_transfer_features(struct net_device *netdev, struct net_device *lower) return; netif_set_gso_max_size(netdev, lower->gso_max_size); - netdev->gso_max_segs = lower->gso_max_segs; + netif_set_gso_max_segs(netdev, lower->gso_max_segs); netdev_update_features(netdev); } @@ -381,7 +381,7 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, /* Advertise but disable TSO by default. */ netdev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); - netdev->gso_max_segs = NFP_NET_LSO_MAX_SEGS; + netif_set_gso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); netdev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; netdev->features |= NETIF_F_LLTX; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index ff222c54299d..1f6313c8d308 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5391,11 +5391,11 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rtl_chip_supports_csum_v2(tp)) { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6; netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V2); - dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2; + netif_set_gso_max_segs(dev, RTL_GSO_MAX_SEGS_V2); } else { dev->hw_features |= NETIF_F_SG | NETIF_F_TSO; netif_set_gso_max_size(dev, RTL_GSO_MAX_SIZE_V1); - dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1; + netif_set_gso_max_segs(dev, RTL_GSO_MAX_SEGS_V1); } dev->hw_features |= NETIF_F_RXALL; diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index d9015a77a6b8..5a6fed33a1d7 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -997,7 +997,7 @@ static int ef100_process_design_param(struct efx_nic *efx, return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS: nic_data->tso_max_payload_num_segs = min_t(u64, reader->value, 0xffff); - efx->net_dev->gso_max_segs = nic_data->tso_max_payload_num_segs; + netif_set_gso_max_segs(efx->net_dev, nic_data->tso_max_payload_num_segs); return 0; case ESE_EF100_DP_GZ_TSO_MAX_NUM_FRAMES: nic_data->tso_max_frames = min_t(u64, reader->value, 0xffff); @@ -1122,7 +1122,7 @@ static int ef100_probe_main(struct efx_nic *efx) nic_data->tso_max_frames = ESE_EF100_DP_GZ_TSO_MAX_NUM_FRAMES_DEFAULT; nic_data->tso_max_payload_num_segs = ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_NUM_SEGS_DEFAULT; nic_data->tso_max_payload_len = ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN_DEFAULT; - net_dev->gso_max_segs = ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS_DEFAULT; + netif_set_gso_max_segs(net_dev, ESE_EF100_DP_GZ_TSO_MAX_HDR_NUM_SEGS_DEFAULT); /* Read design parameters */ rc = ef100_check_design_params(efx); if (rc) { diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 6960a2fe2b53..a8c252e2b252 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -709,7 +709,7 @@ static int efx_register_netdev(struct efx_nic *efx) if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) net_dev->priv_flags |= IFF_UNICAST_FLT; net_dev->ethtool_ops = &efx_ethtool_ops; - net_dev->gso_max_segs = EFX_TSO_MAX_SEGS; + netif_set_gso_max_segs(net_dev, EFX_TSO_MAX_SEGS); net_dev->min_mtu = EFX_MIN_MTU; net_dev->max_mtu = EFX_MAX_MTU; diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index 314c9c69eb0e..60c595ef7589 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2267,7 +2267,7 @@ static int ef4_register_netdev(struct ef4_nic *efx) net_dev->irq = efx->pci_dev->irq; net_dev->netdev_ops = &ef4_netdev_ops; net_dev->ethtool_ops = &ef4_ethtool_ops; - net_dev->gso_max_segs = EF4_TSO_MAX_SEGS; + netif_set_gso_max_segs(net_dev, EF4_TSO_MAX_SEGS); net_dev->min_mtu = EF4_MIN_MTU; net_dev->max_mtu = EF4_MAX_MTU; diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 0ba63eccefa5..20da0b2bd246 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -141,7 +141,7 @@ static int ipvlan_init(struct net_device *dev) dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; netif_set_gso_max_size(dev, phy_dev->gso_max_size); - dev->gso_max_segs = phy_dev->gso_max_segs; + netif_set_gso_max_segs(dev, phy_dev->gso_max_segs); dev->hard_header_len = phy_dev->hard_header_len; netdev_lockdep_set_classes(dev); @@ -764,7 +764,7 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { netif_set_gso_max_size(ipvlan->dev, dev->gso_max_size); - ipvlan->dev->gso_max_segs = dev->gso_max_segs; + netif_set_gso_max_segs(ipvlan->dev, dev->gso_max_segs); netdev_update_features(ipvlan->dev); } break; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 506e9559df80..75b453acd778 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -901,7 +901,7 @@ static int macvlan_init(struct net_device *dev) dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; netif_set_gso_max_size(dev, lowerdev->gso_max_size); - dev->gso_max_segs = lowerdev->gso_max_segs; + netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); @@ -1749,7 +1749,7 @@ static int macvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_set_gso_max_size(vlan->dev, dev->gso_max_size); - vlan->dev->gso_max_segs = dev->gso_max_segs; + netif_set_gso_max_segs(vlan->dev, dev->gso_max_segs); netdev_update_features(vlan->dev); } break; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f2f66fb293fd..5ca0a899101d 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1690,7 +1690,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, peer->ifindex = ifmp->ifi_index; netif_set_gso_max_size(peer, dev->gso_max_size); - peer->gso_max_segs = dev->gso_max_segs; + netif_set_gso_max_segs(peer, dev->gso_max_segs); err = register_netdevice(peer); put_net(net); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index df4f1712be95..82bd794fceb3 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3812,7 +3812,7 @@ static void vxlan_config_apply(struct net_device *dev, dst->remote_ifindex = conf->remote_ifindex; netif_set_gso_max_size(dev, lowerdev->gso_max_size); - dev->gso_max_segs = lowerdev->gso_max_segs; + netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 14eeb58ed197..df049864661d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4735,6 +4735,13 @@ static inline void netif_set_gso_max_size(struct net_device *dev, WRITE_ONCE(dev->gso_max_size, size); } +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 34c0ffa81e5f..0c00200c9cb2 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -323,7 +323,7 @@ static void vlan_transfer_features(struct net_device *dev, struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_set_gso_max_size(vlandev, dev->gso_max_size); - vlandev->gso_max_segs = dev->gso_max_segs; + netif_set_gso_max_segs(vlandev, dev->gso_max_segs); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 208f6845f6dd..e9499a7c19fe 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -574,7 +574,7 @@ static int vlan_dev_init(struct net_device *dev) dev->features |= dev->hw_features | NETIF_F_LLTX; netif_set_gso_max_size(dev, real_dev->gso_max_size); - dev->gso_max_segs = real_dev->gso_max_segs; + netif_set_gso_max_segs(dev, real_dev->gso_max_segs); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index d3e1c2276551..3915832a03c2 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -526,7 +526,7 @@ static void br_set_gso_limits(struct net_bridge *br) gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); } netif_set_gso_max_size(br->dev, gso_max_size); - br->dev->gso_max_segs = gso_max_segs; + netif_set_gso_max_segs(br->dev, gso_max_segs); } /* diff --git a/net/core/dev.c b/net/core/dev.c index 9219e319e901..9c4fc8c3f981 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3396,7 +3396,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, { u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (gso_segs > dev->gso_max_segs) + if (gso_segs > READ_ONCE(dev->gso_max_segs)) return features & ~NETIF_F_GSO_MASK; if (!skb_shinfo(skb)->gso_type) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2af8aeeadadf..fd030e02f16d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2768,7 +2768,7 @@ static int do_setlink(const struct sk_buff *skb, } if (dev->gso_max_segs ^ max_segs) { - dev->gso_max_segs = max_segs; + netif_set_gso_max_segs(dev, max_segs); status |= DO_SETLINK_MODIFIED; } } @@ -3222,7 +3222,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) - dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); return dev; } diff --git a/net/core/sock.c b/net/core/sock.c index 4418e2a07c34..31a2b79c9b38 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2259,7 +2259,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); - max_segs = max_t(u32, dst->dev->gso_max_segs, 1); + /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ + max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; -- cgit v1.2.3 From 2106efda785b55a8957efed9a52dfa28ee0d7280 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 22 Nov 2021 17:24:47 -0800 Subject: net: remove .ndo_change_proto_down .ndo_change_proto_down was added seemingly to enable out-of-tree implementations. Over 2.5yrs later we still have no real users upstream. Hardwire the generic implementation for now, we can revert once real users materialize. (rocker is a test vehicle, not a user.) We need to drop the optimization on the sysfs side, because unlike ndos priv_flags will be changed at runtime, so we'd need READ_ONCE/WRITE_ONCE everywhere.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_main.c | 12 ------------ drivers/net/macvlan.c | 3 +-- drivers/net/vxlan.c | 3 +-- include/linux/netdevice.h | 12 +++--------- net/8021q/vlanproc.c | 2 +- net/core/dev.c | 26 ++++---------------------- net/core/net-sysfs.c | 8 -------- net/core/rtnetlink.c | 3 +-- 8 files changed, 11 insertions(+), 58 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index ba4062881eed..b620470c7905 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -1995,17 +1995,6 @@ static int rocker_port_get_phys_port_name(struct net_device *dev, return err ? -EOPNOTSUPP : 0; } -static int rocker_port_change_proto_down(struct net_device *dev, - bool proto_down) -{ - struct rocker_port *rocker_port = netdev_priv(dev); - - if (rocker_port->dev->flags & IFF_UP) - rocker_port_set_enable(rocker_port, !proto_down); - rocker_port->dev->proto_down = proto_down; - return 0; -} - static void rocker_port_neigh_destroy(struct net_device *dev, struct neighbour *n) { @@ -2037,7 +2026,6 @@ static const struct net_device_ops rocker_port_netdev_ops = { .ndo_set_mac_address = rocker_port_set_mac_address, .ndo_change_mtu = rocker_port_change_mtu, .ndo_get_phys_port_name = rocker_port_get_phys_port_name, - .ndo_change_proto_down = rocker_port_change_proto_down, .ndo_neigh_destroy = rocker_port_neigh_destroy, .ndo_get_port_parent_id = rocker_port_get_port_parent_id, }; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 75b453acd778..6ef5f77be4d0 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1171,7 +1171,6 @@ static const struct net_device_ops macvlan_netdev_ops = { #endif .ndo_get_iflink = macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, - .ndo_change_proto_down = dev_change_proto_down_generic, }; void macvlan_common_setup(struct net_device *dev) @@ -1182,7 +1181,7 @@ void macvlan_common_setup(struct net_device *dev) dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); - dev->priv_flags |= IFF_UNICAST_FLT; + dev->priv_flags |= IFF_UNICAST_FLT | IFF_CHANGE_PROTO_DOWN; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->header_ops = &macvlan_hard_header_ops; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 82bd794fceb3..fecff0a46612 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3234,7 +3234,6 @@ static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_fdb_dump = vxlan_fdb_dump, .ndo_fdb_get = vxlan_fdb_get, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, - .ndo_change_proto_down = dev_change_proto_down_generic, }; static const struct net_device_ops vxlan_netdev_raw_ops = { @@ -3305,7 +3304,7 @@ static void vxlan_setup(struct net_device *dev) dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; netif_keep_dst(dev); - dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_NO_QUEUE | IFF_CHANGE_PROTO_DOWN; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df049864661d..db3bff1ae7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1297,11 +1297,6 @@ struct netdev_net_notifier { * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. - * void (*ndo_change_proto_down)(struct net_device *dev, - * bool proto_down); - * This function is used to pass protocol port error state information - * to the switch driver. The switch driver can react to the proto_down - * by doing a phys down on the associated switch port. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while @@ -1542,8 +1537,6 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); - int (*ndo_change_proto_down)(struct net_device *dev, - bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, @@ -1612,6 +1605,7 @@ struct net_device_ops { * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) + * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1646,6 +1640,7 @@ enum netdev_priv_flags { IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1982,7 +1977,7 @@ struct net_device { /* Read-mostly cache-line for fast-path access */ unsigned int flags; - unsigned int priv_flags; + unsigned long long priv_flags; const struct net_device_ops *netdev_ops; int ifindex; unsigned short gflags; @@ -3735,7 +3730,6 @@ int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index ec87dea23719..08bf6c839e25 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -252,7 +252,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset) stats = dev_get_stats(vlandev, &temp); seq_printf(seq, - "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n", + "%s VID: %d REORDER_HDR: %i dev->priv_flags: %llx\n", vlandev->name, vlan->vlan_id, (int)(vlan->flags & 1), vlandev->priv_flags); diff --git a/net/core/dev.c b/net/core/dev.c index 9c4fc8c3f981..823917de0d2b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8558,35 +8558,17 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) EXPORT_SYMBOL(netdev_port_same_parent_id); /** - * dev_change_proto_down - update protocol port state information + * dev_change_proto_down - set carrier according to proto_down. + * * @dev: device * @proto_down: new value - * - * This info can be used by switch drivers to set the phys state of the - * port. */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - const struct net_device_ops *ops = dev->netdev_ops; - - if (!ops->ndo_change_proto_down) + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; - return ops->ndo_change_proto_down(dev, proto_down); -} -EXPORT_SYMBOL(dev_change_proto_down); - -/** - * dev_change_proto_down_generic - generic implementation for - * ndo_change_proto_down that sets carrier according to - * proto_down. - * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) -{ if (proto_down) netif_carrier_off(dev); else @@ -8594,7 +8576,7 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) dev->proto_down = proto_down; return 0; } -EXPORT_SYMBOL(dev_change_proto_down_generic); +EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_proto_down_reason - proto down reason diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4edd58d34f16..affe34d71d31 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -488,14 +488,6 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct net_device *netdev = to_net_dev(dev); - - /* The check is also done in change_proto_down; this helps returning - * early without hitting the trylock/restart in netdev_store. - */ - if (!netdev->netdev_ops->ndo_change_proto_down) - return -EOPNOTSUPP; - return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fd030e02f16d..6f25c0a8aebe 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2539,13 +2539,12 @@ static int do_set_proto_down(struct net_device *dev, struct netlink_ext_ack *extack) { struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; - const struct net_device_ops *ops = dev->netdev_ops; unsigned long mask = 0; u32 value; bool proto_down; int err; - if (!ops->ndo_change_proto_down) { + if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } -- cgit v1.2.3 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there. BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ net/core/dev.c | 5 ++++- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } diff --git a/net/core/dev.c b/net/core/dev.c index 15ac064b5562..2a352e668d10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4210,7 +4210,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; -- cgit v1.2.3 From 4d92b95ff2f95f13df9bad0b5a25a9f60e72758d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:57 -0800 Subject: net: add net device refcount tracker infrastructure net device are refcounted. Over the years we had numerous bugs caused by imbalanced dev_hold() and dev_put() calls. The general idea is to be able to precisely pair each decrement with a corresponding prior increment. Both share a cookie, basically a pointer to private data storing stack traces. This patch adds dev_hold_track() and dev_put_track(). To use these helpers, each data structure owning a refcount should also use a "netdevice_tracker" to pair the hold and put. netdevice_tracker dev_tracker; ... dev_hold_track(dev, &dev_tracker, GFP_ATOMIC); ... dev_put_track(dev, &dev_tracker); Whenever a leak happens, we will get precise stack traces of the point dev_hold_track() happened, at device dismantle phase. We will also get a stack trace if too many dev_put_track() for the same netdevice_tracker are attempted. This is guarded by CONFIG_NET_DEV_REFCNT_TRACKER option. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 5 +++++ net/Kconfig.debug | 10 ++++++++++ net/core/dev.c | 3 +++ 4 files changed, 63 insertions(+) create mode 100644 net/Kconfig.debug (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 65117f01d5f2..143d60ed0047 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -300,6 +301,12 @@ enum netdev_state_t { }; +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + struct gro_list { struct list_head list; int count; @@ -1865,6 +1872,7 @@ enum netdev_ml_priv_type { * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device + * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * @@ -2178,6 +2186,7 @@ struct net_device { #else refcount_t dev_refcnt; #endif + struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; @@ -3805,6 +3814,7 @@ void netdev_run_todo(void); * @dev: network device * * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. */ static inline void dev_put(struct net_device *dev) { @@ -3822,6 +3832,7 @@ static inline void dev_put(struct net_device *dev) * @dev: network device * * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. */ static inline void dev_hold(struct net_device *dev) { @@ -3834,6 +3845,40 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +static inline void netdev_tracker_free(struct net_device *dev, + netdevice_tracker *tracker) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_free(&dev->refcnt_tracker, tracker); +#endif +} + +static inline void dev_hold_track(struct net_device *dev, + netdevice_tracker *tracker, gfp_t gfp) +{ + if (dev) { + dev_hold(dev); + netdev_tracker_alloc(dev, tracker, gfp); + } +} + +static inline void dev_put_track(struct net_device *dev, + netdevice_tracker *tracker) +{ + if (dev) { + netdev_tracker_free(dev, tracker); + dev_put(dev); + } +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 633c2c5cb45b..6504b97f8dfd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -598,6 +598,11 @@ config DEBUG_MISC Say Y here if you need to enable miscellaneous debug code that should be under a more specific debug option but isn't. +menu "Networking Debugging" + +source "net/Kconfig.debug" + +endmenu # "Networking Debugging" menu "Memory Debugging" diff --git a/net/Kconfig.debug b/net/Kconfig.debug new file mode 100644 index 000000000000..fb5c70e01cb3 --- /dev/null +++ b/net/Kconfig.debug @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config NET_DEV_REFCNT_TRACKER + bool "Enable net device refcount tracking" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select REF_TRACKER + default n + help + Enable debugging feature to track device references. + This adds memory and cpu costs. diff --git a/net/core/dev.c b/net/core/dev.c index aba8acc1238c..1740d6cfe86b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9864,6 +9864,7 @@ static void netdev_wait_allrefs(struct net_device *dev) netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); + ref_tracker_dir_print(&dev->refcnt_tracker, 10); warning_time = jiffies; } } @@ -10154,6 +10155,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; + ref_tracker_dir_init(&dev->refcnt_tracker, 128); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -10270,6 +10272,7 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; -- cgit v1.2.3 From 80e8921b2b72c300ca56a01729004d30bedb82cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:58 -0800 Subject: net: add net device refcount tracker to struct netdev_rx_queue This helps debugging net device refcount leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/net-sysfs.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 143d60ed0047..3d691fadd569 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -741,6 +741,8 @@ struct netdev_rx_queue { #endif struct kobject kobj; struct net_device *dev; + netdevice_tracker dev_tracker; + #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index affe34d71d31..27a7ac2e516f 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1004,7 +1004,7 @@ static void rx_queue_release(struct kobject *kobj) #endif memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + dev_put_track(queue->dev, &queue->dev_tracker); } static const void *rx_queue_namespace(struct kobject *kobj) @@ -1044,7 +1044,7 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, -- cgit v1.2.3 From 0b688f24b7d611db3a02f3d4ab562d049c78a17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:21:59 -0800 Subject: net: add net device refcount tracker to struct netdev_queue This will help debugging pesky netdev reference leaks. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/net-sysfs.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d691fadd569..b4f704337f65 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -586,6 +586,8 @@ struct netdev_queue { * read-mostly part */ struct net_device *dev; + netdevice_tracker dev_tracker; + struct Qdisc __rcu *qdisc; struct Qdisc *qdisc_sleeping; #ifdef CONFIG_SYSFS diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 27a7ac2e516f..3b2cdbbdc858 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1607,7 +1607,7 @@ static void netdev_queue_release(struct kobject *kobj) struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); - dev_put(queue->dev); + dev_put_track(queue->dev, &queue->dev_tracker); } static const void *netdev_queue_namespace(struct kobject *kobj) @@ -1647,7 +1647,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ - dev_hold(queue->dev); + dev_hold_track(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, -- cgit v1.2.3 From 9038c320001dd07f60736018edf608ac5baca0ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:03 -0800 Subject: net: dst: add net device refcount tracking to dst_entry We want to track all dev_hold()/dev_put() to ease leak hunting. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 17 +++++++++++++++++ include/net/dst.h | 1 + net/core/dst.c | 8 ++++---- net/ipv4/route.c | 7 ++++--- net/ipv6/route.c | 5 +++-- 5 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b4f704337f65..afed3b10491b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3883,6 +3883,23 @@ static inline void dev_put_track(struct net_device *dev, } } +static inline void dev_replace_track(struct net_device *odev, + struct net_device *ndev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (odev) + ref_tracker_free(&odev->refcnt_tracker, tracker); +#endif + dev_hold(ndev); + dev_put(odev); +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) + ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); +#endif +} + /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. diff --git a/include/net/dst.h b/include/net/dst.h index a057319aabef..6aa252c3fc55 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -77,6 +77,7 @@ struct dst_entry { #ifndef CONFIG_64BIT atomic_t __refcnt; /* 32-bit offset 64 */ #endif + netdevice_tracker dev_tracker; }; struct dst_metrics { diff --git a/net/core/dst.c b/net/core/dst.c index 497ef9b3fc6a..d16c2c9bfebd 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,7 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - dev_hold(dev); + dev_hold_track(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -117,7 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - dev_put(dst->dev); + dev_put_track(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); @@ -159,8 +159,8 @@ void dst_dev_put(struct dst_entry *dst) dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; - dev_hold(dst->dev); - dev_put(dev); + dev_replace_track(dev, blackhole_netdev, &dst->dev_tracker, + GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 243a0c52be42..843a7a3699fe 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1531,8 +1531,9 @@ void rt_flush_dev(struct net_device *dev) if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; - dev_hold(rt->dst.dev); - dev_put(dev); + dev_replace_track(dev, blackhole_netdev, + &rt->dst.dev_tracker, + GFP_ATOMIC); } spin_unlock_bh(&ul->lock); } @@ -2819,7 +2820,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->output = dst_discard_out; new->dev = net->loopback_dev; - dev_hold(new->dev); + dev_hold_track(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f0d29fcb2094..ba4dc94d76d6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -182,8 +182,9 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; - dev_hold(rt->dst.dev); - dev_put(rt_dev); + dev_replace_track(rt_dev, blackhole_netdev, + &rt->dst.dev_tracker, + GFP_ATOMIC); } } spin_unlock_bh(&ul->lock); -- cgit v1.2.3 From 63f13937cbe9b00982dfc8e578b1aec8e5037333 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Dec 2021 20:22:14 -0800 Subject: net: linkwatch: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device is in lweventlist. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/link_watch.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index afed3b10491b..69dca1edd5a6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * keep a list of interfaces to be deleted. * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. + * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2280,6 +2281,7 @@ struct net_device { struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; + netdevice_tracker linkwatch_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 9599afd0862d..d7d089963b1d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -109,7 +109,7 @@ static void linkwatch_add_event(struct net_device *dev) spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); - dev_hold(dev); + dev_hold_track(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } @@ -166,7 +166,7 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } - dev_put(dev); + dev_put_track(dev, &dev->linkwatch_dev_tracker); } static void __linkwatch_run_queue(int urgent_only) -- cgit v1.2.3 From f12bf6f3f942b37de65eeea8be25903587fec930 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Dec 2021 17:30:30 -0800 Subject: net: watchdog: add net device refcount tracker Add a netdevice_tracker inside struct net_device, to track the self reference when a device has an active watchdog timer. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/sched/sch_generic.c | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69dca1edd5a6..1a748ee9a421 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1951,6 +1951,7 @@ enum netdev_ml_priv_type { * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. + * @watchdog_dev_tracker: refcount tracker used by watchdog. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2283,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; + netdevice_tracker watchdog_dev_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8c8fbf2e3856..b07bd1c7330f 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -499,6 +499,7 @@ EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { struct net_device *dev = from_timer(dev, t, watchdog_timer); + bool release = true; spin_lock(&dev->tx_global_lock); if (!qdisc_tx_is_noop(dev)) { @@ -534,12 +535,13 @@ static void dev_watchdog(struct timer_list *t) if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) - dev_hold(dev); + release = false; } } spin_unlock(&dev->tx_global_lock); - dev_put(dev); + if (release) + dev_put_track(dev, &dev->watchdog_dev_tracker); } void __netdev_watchdog_up(struct net_device *dev) @@ -549,7 +551,7 @@ void __netdev_watchdog_up(struct net_device *dev) dev->watchdog_timeo = 5*HZ; if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo))) - dev_hold(dev); + dev_hold_track(dev, &dev->watchdog_dev_tracker, GFP_ATOMIC); } } EXPORT_SYMBOL_GPL(__netdev_watchdog_up); @@ -563,7 +565,7 @@ static void dev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) - dev_put(dev); + dev_put_track(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } -- cgit v1.2.3 From 9ba74e6c9e9d0c5c1e5792a7111fc7d1a0589cb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Dec 2021 23:44:21 -0800 Subject: net: add networking namespace refcount tracker We have 100+ syzbot reports about netns being dismantled too soon, still unresolved as of today. We think a missing get_net() or an extra put_net() is the root cause. In order to find the bug(s), and be able to spot future ones, this patch adds CONFIG_NET_NS_REFCNT_TRACKER and new helpers to precisely pair all put_net() with corresponding get_net(). To use these helpers, each data structure owning a refcount should also use a "netns_tracker" to pair the get and put. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +-------- include/net/net_namespace.h | 34 ++++++++++++++++++++++++++++++++++ include/net/net_trackers.h | 18 ++++++++++++++++++ net/Kconfig.debug | 9 +++++++++ net/core/net_namespace.c | 3 +++ 5 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 include/net/net_trackers.h (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a748ee9a421..235d5d082f1a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,7 +48,7 @@ #include #include #include -#include +#include struct netpoll_info; struct device; @@ -300,13 +300,6 @@ enum netdev_state_t { __LINK_STATE_TESTING, }; - -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER -typedef struct ref_tracker *netdevice_tracker; -#else -typedef struct {} netdevice_tracker; -#endif - struct gro_list { struct list_head list; int count; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bb5fa5914032..5b61c462e534 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -87,6 +88,7 @@ struct net { struct idr netns_ids; struct ns_common ns; + struct ref_tracker_dir refcnt_tracker; struct list_head dev_base_head; struct proc_dir_entry *proc_net; @@ -240,6 +242,7 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +/* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { refcount_inc(&net->ns.count); @@ -258,6 +261,7 @@ static inline struct net *maybe_get_net(struct net *net) return net; } +/* Try using put_net_track() instead */ static inline void put_net(struct net *net) { if (refcount_dec_and_test(&net->ns.count)) @@ -308,6 +312,36 @@ static inline int check_net(const struct net *net) #endif +static inline void netns_tracker_alloc(struct net *net, + netns_tracker *tracker, gfp_t gfp) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp); +#endif +} + +static inline void netns_tracker_free(struct net *net, + netns_tracker *tracker) +{ +#ifdef CONFIG_NET_NS_REFCNT_TRACKER + ref_tracker_free(&net->refcnt_tracker, tracker); +#endif +} + +static inline struct net *get_net_track(struct net *net, + netns_tracker *tracker, gfp_t gfp) +{ + get_net(net); + netns_tracker_alloc(net, tracker, gfp); + return net; +} + +static inline void put_net_track(struct net *net, netns_tracker *tracker) +{ + netns_tracker_free(net, tracker); + put_net(net); +} + typedef struct { #ifdef CONFIG_NET_NS struct net *net; diff --git a/include/net/net_trackers.h b/include/net/net_trackers.h new file mode 100644 index 000000000000..d94c76cf15a9 --- /dev/null +++ b/include/net/net_trackers.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NET_TRACKERS_H +#define __NET_NET_TRACKERS_H +#include + +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER +typedef struct ref_tracker *netdevice_tracker; +#else +typedef struct {} netdevice_tracker; +#endif + +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +typedef struct ref_tracker *netns_tracker; +#else +typedef struct {} netns_tracker; +#endif + +#endif /* __NET_NET_TRACKERS_H */ diff --git a/net/Kconfig.debug b/net/Kconfig.debug index fb5c70e01cb3..2f50611df858 100644 --- a/net/Kconfig.debug +++ b/net/Kconfig.debug @@ -8,3 +8,12 @@ config NET_DEV_REFCNT_TRACKER help Enable debugging feature to track device references. This adds memory and cpu costs. + +config NET_NS_REFCNT_TRACKER + bool "Enable networking namespace refcount tracking" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select REF_TRACKER + default n + help + Enable debugging feature to track netns references. + This adds memory and cpu costs. diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 202fa5eacd0f..9b7171c40434 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -311,6 +311,8 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(net_exit_list); refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128); + refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); @@ -635,6 +637,7 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { + ref_tracker_dir_exit(&net->refcnt_tracker); /* Cleanup the network namespace in process context */ if (llist_add(&net->cleanup_list, &cleanup_list)) queue_work(netns_wq, &net_cleanup_work); -- cgit v1.2.3 From 9280ac2e6f199cddcd746a9ba459136b8666287b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:15:15 -0800 Subject: net: dev_replace_track() cleanup Use existing helpers (netdev_tracker_free() and netdev_tracker_alloc()) to remove ifdefery. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20211214151515.312535-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 235d5d082f1a..c06e9dc1a317 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3885,16 +3885,14 @@ static inline void dev_replace_track(struct net_device *odev, netdevice_tracker *tracker, gfp_t gfp) { -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER if (odev) - ref_tracker_free(&odev->refcnt_tracker, tracker); -#endif + netdev_tracker_free(odev, tracker); + dev_hold(ndev); dev_put(odev); -#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + if (ndev) - ref_tracker_alloc(&ndev->refcnt_tracker, tracker, gfp); -#endif + netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From f1d9268e061863ead77b07f5a6807d063e28a1c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Dec 2021 07:09:33 -0800 Subject: net: add net device refcount tracker to struct packet_type Most notable changes are in af_packet, tipc ones are trivial. Signed-off-by: Eric Dumazet Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/packet/af_packet.c | 14 +++++++++++--- net/tipc/bearer.c | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c06e9dc1a317..a419718612c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2533,6 +2533,7 @@ struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ + netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a1ffdb48cc47..71854a16afbb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3109,7 +3109,7 @@ static int packet_release(struct socket *sock) packet_cached_dev_reset(po); if (po->prot_hook.dev) { - dev_put(po->prot_hook.dev); + dev_put_track(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); @@ -3217,18 +3217,25 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; + dev_put_track(dev_curr, &po->prot_hook.dev_tracker); + dev_curr = NULL; + if (unlikely(unlisted)) { dev_put(dev); po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { + if (dev) + netdev_tracker_alloc(dev, + &po->prot_hook.dev_tracker, + GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } } - dev_put(dev_curr); + dev_put_track(dev_curr, &po->prot_hook.dev_tracker); if (proto == 0 || !need_rehook) goto out_unlock; @@ -4138,7 +4145,8 @@ static int packet_notifier(struct notifier_block *this, if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); - dev_put(po->prot_hook.dev); + dev_put_track(po->prot_hook.dev, + &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 60bc74b76adc..473a790f5894 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -787,7 +787,7 @@ int tipc_attach_loopback(struct net *net) if (!dev) return -ENODEV; - dev_hold(dev); + dev_hold_track(dev, &tn->loopback_pt.dev_tracker, GFP_KERNEL); tn->loopback_pt.dev = dev; tn->loopback_pt.type = htons(ETH_P_TIPC); tn->loopback_pt.func = tipc_loopback_rcv_pkt; @@ -800,7 +800,7 @@ void tipc_detach_loopback(struct net *net) struct tipc_net *tn = tipc_net(net); dev_remove_pack(&tn->loopback_pt); - dev_put(net->loopback_dev); + dev_put_track(net->loopback_dev, &tn->loopback_pt.dev_tracker); } /* Caller should hold rtnl_lock to protect the bearer */ -- cgit v1.2.3 From b62e3317b68d9c84301940ca8ca9c35a584111b2 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Thu, 16 Dec 2021 23:19:16 +0800 Subject: net: fix typo in a comment The double 'as' in a comment is repeated, thus it should be removed. Signed-off-by: Xiang wangx Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be5cb3360b94..6aadcc0ecb5b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1937,7 +1937,7 @@ enum netdev_ml_priv_type { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * - * @nested_level: Used as as a parameter of spin_lock_nested() of + * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. -- cgit v1.2.3 From 8cbfe939abe905280279e84a297b1cb34e0d0ec9 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 17 Dec 2021 19:16:22 +0100 Subject: flow_offload: allow user to offload tc action to net device Use flow_indr_dev_register/flow_indr_dev_setup_offload to offload tc action. We need to call tc_cleanup_flow_action to clean up tc action entry since in tc_setup_action, some actions may hold dev refcnt, especially the mirror action. Signed-off-by: Baowen Zheng Signed-off-by: Louis Peens Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/flow_offload.h | 17 +++++++++ include/net/pkt_cls.h | 5 +++ net/core/flow_offload.c | 42 +++++++++++++++++---- net/sched/act_api.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++ net/sched/act_csum.c | 4 +- net/sched/act_ct.c | 4 +- net/sched/act_gact.c | 13 ++++++- net/sched/act_gate.c | 4 +- net/sched/act_mirred.c | 13 ++++++- net/sched/act_mpls.c | 16 +++++++- net/sched/act_police.c | 4 +- net/sched/act_sample.c | 4 +- net/sched/act_skbedit.c | 11 +++++- net/sched/act_tunnel_key.c | 9 ++++- net/sched/act_vlan.c | 16 +++++++- net/sched/cls_api.c | 21 +++++++++-- 17 files changed, 254 insertions(+), 23 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a419718612c6..8b0bdeb4734e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -920,6 +920,7 @@ enum tc_setup_type { TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, + TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 2271da5aa8ee..5b8c54eb7a6b 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -551,6 +551,23 @@ struct flow_cls_offload { u32 classid; }; +enum offload_act_command { + FLOW_ACT_REPLACE, + FLOW_ACT_DESTROY, + FLOW_ACT_STATS, +}; + +struct flow_offload_action { + struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process*/ + enum offload_act_command command; + enum flow_action_id id; + u32 index; + struct flow_stats stats; + struct flow_action action; +}; + +struct flow_offload_action *offload_action_alloc(unsigned int num_actions); + static inline struct flow_rule * flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) { diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5d4ff76d37e2..1bfb616ea759 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -262,6 +262,9 @@ static inline void tcf_exts_put_net(struct tcf_exts *exts) for (; 0; (void)(i), (void)(a), (void)(exts)) #endif +#define tcf_act_for_each_action(i, a, actions) \ + for (i = 0; i < TCA_ACT_MAX_PRIO && ((a) = actions[i]); i++) + static inline void tcf_exts_stats_update(const struct tcf_exts *exts, u64 bytes, u64 packets, u64 drops, u64 lastuse, @@ -539,6 +542,8 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts); void tc_cleanup_offload_action(struct flow_action *flow_action); +int tc_setup_action(struct flow_action *flow_action, + struct tc_action *actions[]); int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 6beaea13564a..022c945817fa 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -27,6 +27,26 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) } EXPORT_SYMBOL(flow_rule_alloc); +struct flow_offload_action *offload_action_alloc(unsigned int num_actions) +{ + struct flow_offload_action *fl_action; + int i; + + fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), + GFP_KERNEL); + if (!fl_action) + return NULL; + + fl_action->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; + + return fl_action; +} + #define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ const struct flow_match *__m = &(__rule)->match; \ struct flow_dissector *__d = (__m)->dissector; \ @@ -549,19 +569,25 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_indr_dev *this; + u32 count = 0; + int err; mutex_lock(&flow_indr_block_lock); + if (bo) { + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + } - if (bo->command == FLOW_BLOCK_BIND) - indir_dev_add(data, dev, sch, type, cleanup, bo); - else if (bo->command == FLOW_BLOCK_UNBIND) - indir_dev_remove(data); - - list_for_each_entry(this, &flow_block_indr_dev_list, list) - this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + list_for_each_entry(this, &flow_block_indr_dev_list, list) { + err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); + if (!err) + count++; + } mutex_unlock(&flow_indr_block_lock); - return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; + return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3258da3d5bed..5c21401b0555 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -19,8 +19,10 @@ #include #include #include +#include #include #include +#include #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); @@ -129,8 +131,92 @@ static void free_tcf(struct tc_action *p) kfree(p); } +static unsigned int tcf_offload_act_num_actions_single(struct tc_action *act) +{ + if (is_tcf_pedit(act)) + return tcf_pedit_nkeys(act); + else + return 1; +} + +static int offload_action_init(struct flow_offload_action *fl_action, + struct tc_action *act, + enum offload_act_command cmd, + struct netlink_ext_ack *extack) +{ + fl_action->extack = extack; + fl_action->command = cmd; + fl_action->index = act->tcfa_index; + + if (act->ops->offload_act_setup) + return act->ops->offload_act_setup(act, fl_action, NULL, false); + + return -EOPNOTSUPP; +} + +static int tcf_action_offload_cmd(struct flow_offload_action *fl_act, + struct netlink_ext_ack *extack) +{ + int err; + + err = flow_indr_dev_setup_offload(NULL, NULL, TC_SETUP_ACT, + fl_act, NULL, NULL); + if (err < 0) + return err; + + return 0; +} + +/* offload the tc action after it is inserted */ +static int tcf_action_offload_add(struct tc_action *action, + struct netlink_ext_ack *extack) +{ + struct tc_action *actions[TCA_ACT_MAX_PRIO] = { + [0] = action, + }; + struct flow_offload_action *fl_action; + int num, err = 0; + + num = tcf_offload_act_num_actions_single(action); + fl_action = offload_action_alloc(num); + if (!fl_action) + return -ENOMEM; + + err = offload_action_init(fl_action, action, FLOW_ACT_REPLACE, extack); + if (err) + goto fl_err; + + err = tc_setup_action(&fl_action->action, actions); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to setup tc actions for offload\n"); + goto fl_err; + } + + err = tcf_action_offload_cmd(fl_action, extack); + tc_cleanup_offload_action(&fl_action->action); + +fl_err: + kfree(fl_action); + + return err; +} + +static int tcf_action_offload_del(struct tc_action *action) +{ + struct flow_offload_action fl_act = {}; + int err = 0; + + err = offload_action_init(&fl_act, action, FLOW_ACT_DESTROY, NULL); + if (err) + return err; + + return tcf_action_offload_cmd(&fl_act, NULL); +} + static void tcf_action_cleanup(struct tc_action *p) { + tcf_action_offload_del(p); if (p->ops->cleanup) p->ops->cleanup(p); @@ -1061,6 +1147,11 @@ err_out: return ERR_PTR(err); } +static bool tc_act_bind(u32 flags) +{ + return !!(flags & TCA_ACT_FLAGS_BIND); +} + /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, @@ -1103,6 +1194,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; + if (!tc_act_bind(flags)) + tcf_action_offload_add(act, extack); } /* We have to commit them all together, because if any error happened in diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4428852a03d7..e0f515b774ca 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -705,7 +705,9 @@ static int tcf_csum_offload_act_setup(struct tc_action *act, void *entry_data, entry->csum_flags = tcf_csum_update_flags(act); *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + fl_action->id = FLOW_ACTION_CSUM; } return 0; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index dc64f31e5191..1c537913a189 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1505,7 +1505,9 @@ static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data, entry->ct.flow_table = tcf_ct_ft(act); *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + fl_action->id = FLOW_ACTION_CT; } return 0; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index f77be22069f4..bde6a6c01e64 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -272,7 +272,18 @@ static int tcf_gact_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + if (is_tcf_gact_ok(act)) + fl_action->id = FLOW_ACTION_ACCEPT; + else if (is_tcf_gact_shot(act)) + fl_action->id = FLOW_ACTION_DROP; + else if (is_tcf_gact_trap(act)) + fl_action->id = FLOW_ACTION_TRAP; + else if (is_tcf_gact_goto_chain(act)) + fl_action->id = FLOW_ACTION_GOTO; + else + return -EOPNOTSUPP; } return 0; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 1d8297497692..d56e73843a4b 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -637,7 +637,9 @@ static int tcf_gate_offload_act_setup(struct tc_action *act, void *entry_data, return err; *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + fl_action->id = FLOW_ACTION_GATE; } return 0; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 8eecf55be0a2..39acd1d18609 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -482,7 +482,18 @@ static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + if (is_tcf_mirred_egress_redirect(act)) + fl_action->id = FLOW_ACTION_REDIRECT; + else if (is_tcf_mirred_egress_mirror(act)) + fl_action->id = FLOW_ACTION_MIRRED; + else if (is_tcf_mirred_ingress_redirect(act)) + fl_action->id = FLOW_ACTION_REDIRECT_INGRESS; + else if (is_tcf_mirred_ingress_mirror(act)) + fl_action->id = FLOW_ACTION_MIRRED_INGRESS; + else + return -EOPNOTSUPP; } return 0; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index a4615e1331e0..b9ff3459fdab 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -415,7 +415,21 @@ static int tcf_mpls_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + switch (tcf_mpls_action(act)) { + case TCA_MPLS_ACT_PUSH: + fl_action->id = FLOW_ACTION_MPLS_PUSH; + break; + case TCA_MPLS_ACT_POP: + fl_action->id = FLOW_ACTION_MPLS_POP; + break; + case TCA_MPLS_ACT_MODIFY: + fl_action->id = FLOW_ACTION_MPLS_MANGLE; + break; + default: + return -EOPNOTSUPP; + } } return 0; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index abb6d16a20b2..0923aa2b8f8a 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -421,7 +421,9 @@ static int tcf_police_offload_act_setup(struct tc_action *act, void *entry_data, entry->police.mtu = tcf_police_tcfp_mtu(act); *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + fl_action->id = FLOW_ACTION_POLICE; } return 0; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 07e56903211e..9a22cdda6bbd 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -303,7 +303,9 @@ static int tcf_sample_offload_act_setup(struct tc_action *act, void *entry_data, tcf_offload_sample_get_group(entry, act); *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + fl_action->id = FLOW_ACTION_SAMPLE; } return 0; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index c380f9e6cc95..ceba11b198bb 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -347,7 +347,16 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + if (is_tcf_skbedit_mark(act)) + fl_action->id = FLOW_ACTION_MARK; + else if (is_tcf_skbedit_ptype(act)) + fl_action->id = FLOW_ACTION_PTYPE; + else if (is_tcf_skbedit_priority(act)) + fl_action->id = FLOW_ACTION_PRIORITY; + else + return -EOPNOTSUPP; } return 0; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index e96a65a5323e..23aba03d26a8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -827,7 +827,14 @@ static int tcf_tunnel_key_offload_act_setup(struct tc_action *act, } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + if (is_tcf_tunnel_set(act)) + fl_action->id = FLOW_ACTION_TUNNEL_ENCAP; + else if (is_tcf_tunnel_release(act)) + fl_action->id = FLOW_ACTION_TUNNEL_DECAP; + else + return -EOPNOTSUPP; } return 0; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 0300792084f0..756e2dcde1cd 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -395,7 +395,21 @@ static int tcf_vlan_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = 1; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + + switch (tcf_vlan_action(act)) { + case TCA_VLAN_ACT_PUSH: + fl_action->id = FLOW_ACTION_VLAN_PUSH; + break; + case TCA_VLAN_ACT_POP: + fl_action->id = FLOW_ACTION_VLAN_POP; + break; + case TCA_VLAN_ACT_MODIFY: + fl_action->id = FLOW_ACTION_VLAN_MANGLE; + break; + default: + return -EOPNOTSUPP; + } } return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 53f263c9a725..353e1eed48be 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3488,8 +3488,8 @@ static int tc_setup_offload_act(struct tc_action *act, #endif } -int tc_setup_offload_action(struct flow_action *flow_action, - const struct tcf_exts *exts) +int tc_setup_action(struct flow_action *flow_action, + struct tc_action *actions[]) { int i, j, index, err = 0; struct tc_action *act; @@ -3498,11 +3498,11 @@ int tc_setup_offload_action(struct flow_action *flow_action, BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); - if (!exts) + if (!actions) return 0; j = 0; - tcf_exts_for_each_action(i, act, exts) { + tcf_act_for_each_action(i, act, actions) { struct flow_action_entry *entry; entry = &flow_action->entries[j]; @@ -3531,6 +3531,19 @@ err_out_locked: spin_unlock_bh(&act->tcfa_lock); goto err_out; } + +int tc_setup_offload_action(struct flow_action *flow_action, + const struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + if (!exts) + return 0; + + return tc_setup_action(flow_action, exts->actions); +#else + return 0; +#endif +} EXPORT_SYMBOL(tc_setup_offload_action); unsigned int tcf_exts_num_actions(struct tcf_exts *exts) -- cgit v1.2.3 From d6c6d0bb2cb3af91c9c1546af7cdf4770d0df443 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 12:36:20 +0100 Subject: net: remove references to CONFIG_IRDA in network header files Commit d64c2a76123f ("staging: irda: remove the irda network stack and drivers") removes the config IRDA. Remove the remaining references to this non-existing config in the network header files. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20211229113620.19368-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/atalk.h | 2 +- include/linux/netdevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index f6034ba774be..a55bfc6567d0 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -113,7 +113,7 @@ extern int aarp_proto_init(void); /* Inter module exports */ /* Give a device find its atif control structure */ -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) static inline struct atalk_iface *atalk_find_dev(struct net_device *dev) { return dev->atalk_ptr; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2684bdb6defa..6f99c8f51b60 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2098,7 +2098,7 @@ struct net_device { #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif -#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) +#if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif struct in_device __rcu *ip_ptr; -- cgit v1.2.3 From eac1b93c14d645ef147b049ace0d5230df755548 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 5 Jan 2022 02:48:38 -0800 Subject: gro: add ability to control gro max packet size Eric Dumazet suggested to allow users to modify max GRO packet size. We have seen GRO being disabled by users of appliances (such as wifi access points) because of claimed bufferbloat issues, or some work arounds in sch_cake, to split GRO/GSO packets. Instead of disabling GRO completely, one can chose to limit the maximum packet size of GRO packets, depending on their latency constraints. This patch adds a per device gro_max_size attribute that can be changed with ip link command. ip link set dev eth0 gro_max_size 16000 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 11 +++++++++++ include/uapi/linux/if_link.h | 1 + net/core/dev.c | 1 + net/core/gro.c | 6 +++++- net/core/rtnetlink.c | 22 ++++++++++++++++++++++ tools/include/uapi/linux/if_link.h | 1 + 6 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f99c8f51b60..3213c7227b59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1942,6 +1942,8 @@ enum netdev_ml_priv_type { * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. + * @gro_max_size: Maximum size of aggregated packet in generic + * receive offload (GRO) * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. @@ -2131,6 +2133,8 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; +#define GRO_MAX_SIZE 65536 + unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -4806,6 +4810,13 @@ static inline void netif_set_gso_max_segs(struct net_device *dev, WRITE_ONCE(dev->gso_max_segs, segs); } +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4ac53b30b6dc..6218f93f5c1a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -347,6 +347,7 @@ enum { */ IFLA_PARENT_DEV_NAME, IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, __IFLA_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 6c8b226b5f2f..83a4089990a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10180,6 +10180,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; + dev->gro_max_size = GRO_MAX_SIZE; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP diff --git a/net/core/gro.c b/net/core/gro.c index 8ec8b44596da..a11b286d1495 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -132,10 +132,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; - if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) + /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ + gro_max_size = READ_ONCE(p->dev->gro_max_size); + + if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; lp = NAPI_GRO_CB(p)->last; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6eba554b137..e476403231f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1026,6 +1026,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1728,6 +1729,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || + nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1880,6 +1882,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, + [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2299,6 +2302,14 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (gro_max_size > GRO_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "too big gro_max_size"); + return -EINVAL; + } + } return 0; } @@ -2772,6 +2783,15 @@ static int do_setlink(const struct sk_buff *skb, } } + if (tb[IFLA_GRO_MAX_SIZE]) { + u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); + + if (dev->gro_max_size ^ gro_max_size) { + netif_set_gro_max_size(dev, gro_max_size); + status |= DO_SETLINK_MODIFIED; + } + } + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); @@ -3222,6 +3242,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS])); + if (tb[IFLA_GRO_MAX_SIZE]) + netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE])); return dev; } diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 4ac53b30b6dc..6218f93f5c1a 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -347,6 +347,7 @@ enum { */ IFLA_PARENT_DEV_NAME, IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, __IFLA_MAX }; -- cgit v1.2.3 From 47934e06b65637c88a762d9c98329ae6e3238888 Mon Sep 17 00:00:00 2001 From: Congyu Liu Date: Tue, 18 Jan 2022 14:20:13 -0500 Subject: net: fix information leakage in /proc/net/ptype In one net namespace, after creating a packet socket without binding it to a device, users in other net namespaces can observe the new `packet_type` added by this packet socket by reading `/proc/net/ptype` file. This is minor information leakage as packet socket is namespace aware. Add a net pointer in `packet_type` to keep the net namespace of of corresponding packet socket. In `ptype_seq_show`, this net pointer must be checked when it is not NULL. Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") Signed-off-by: Congyu Liu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/net-procfs.c | 3 ++- net/packet/af_packet.c | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3213c7227b59..e490b84732d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2548,6 +2548,7 @@ struct packet_type { struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); + struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index d8b9dbabd4a4..5b8016335aca 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -260,7 +260,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, "Type Device Function\n"); - else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && + (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { if (pt->type == htons(ETH_P_ALL)) seq_puts(seq, "ALL "); else diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5bd409ab4cc2..85ea7ddb48db 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1774,6 +1774,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; list_add(&match->list, &fanout_list); @@ -3353,6 +3354,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; + po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; -- cgit v1.2.3