From 0b5c21bbc01e92745ca1ca4f6fd87d878fa3ea5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Apr 2022 11:38:47 +0200 Subject: net: ensure net_todo_list is processed quickly In [1], Will raised a potential issue that the cfg80211 code, which does (from a locking perspective) rtnl_lock() wiphy_lock() rtnl_unlock() might be suspectible to ABBA deadlocks, because rtnl_unlock() calls netdev_run_todo(), which might end up calling rtnl_lock() again, which could then deadlock (see the comment in the code added here for the scenario). Some back and forth and thinking ensued, but clearly this can't happen if the net_todo_list is empty at the rtnl_unlock() here. Clearly, the code here cannot actually put an entry on it, and all other users of rtnl_unlock() will empty it since that will always go through netdev_run_todo(), emptying the list. So the only other way to get there would be to add to the list and then unlock the RTNL without going through rtnl_unlock(), which is only possible through __rtnl_unlock(). However, this isn't exported and not used in many places, and none of them seem to be able to unregister before using it. Therefore, add a WARN_ON() in the code to ensure this invariant won't be broken, so that the cfg80211 (or any similar) code stays safe. [1] https://lore.kernel.org/r/Yjzpo3TfZxtKPMAG@google.com Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20220404113847.0ee02e4a70da.Ic73d206e217db20fd22dcec14fe5442ca732804b@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 59e27a2b7bf0..b6a1e7f643da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3894,7 +3894,8 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; extern unsigned int netdev_budget_usecs; -/* Called by rtnetlink.c:rtnl_unlock() */ +/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ +extern struct list_head net_todo_list; void netdev_run_todo(void); static inline void __dev_put(struct net_device *dev) -- cgit v1.2.3 From a333215e10cb5d3b1e0685ca117f0e9452215485 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 5 Apr 2022 21:57:48 +0200 Subject: net: ethernet: mtk_eth_soc: implement flow offloading to WED devices This allows hardware flow offloading from Ethernet to WLAN on MT7622 SoC Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Felix Fietkau Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_ppe.c | 18 ++++++++ drivers/net/ethernet/mediatek/mtk_ppe.h | 14 ++++--- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 56 ++++++++++++++++++++++++- drivers/net/ethernet/mediatek/mtk_wed.h | 7 ++++ include/linux/netdevice.h | 7 ++++ net/core/dev.c | 4 ++ 6 files changed, 98 insertions(+), 8 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 3ad10c793308..472bcd3269a7 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -329,6 +329,24 @@ int mtk_foe_entry_set_pppoe(struct mtk_foe_entry *entry, int sid) return 0; } +int mtk_foe_entry_set_wdma(struct mtk_foe_entry *entry, int wdma_idx, int txq, + int bss, int wcid) +{ + struct mtk_foe_mac_info *l2 = mtk_foe_entry_l2(entry); + u32 *ib2 = mtk_foe_entry_ib2(entry); + + *ib2 &= ~MTK_FOE_IB2_PORT_MG; + *ib2 |= MTK_FOE_IB2_WDMA_WINFO; + if (wdma_idx) + *ib2 |= MTK_FOE_IB2_WDMA_DEVIDX; + + l2->vlan2 = FIELD_PREP(MTK_FOE_VLAN2_WINFO_BSS, bss) | + FIELD_PREP(MTK_FOE_VLAN2_WINFO_WCID, wcid) | + FIELD_PREP(MTK_FOE_VLAN2_WINFO_RING, txq); + + return 0; +} + static inline bool mtk_foe_entry_usable(struct mtk_foe_entry *entry) { return !(entry->ib1 & MTK_FOE_IB1_STATIC) && diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.h b/drivers/net/ethernet/mediatek/mtk_ppe.h index 242fb8f2ae65..df8ccaf48171 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.h +++ b/drivers/net/ethernet/mediatek/mtk_ppe.h @@ -48,9 +48,9 @@ enum { #define MTK_FOE_IB2_DEST_PORT GENMASK(7, 5) #define MTK_FOE_IB2_MULTICAST BIT(8) -#define MTK_FOE_IB2_WHNAT_QID2 GENMASK(13, 12) -#define MTK_FOE_IB2_WHNAT_DEVIDX BIT(16) -#define MTK_FOE_IB2_WHNAT_NAT BIT(17) +#define MTK_FOE_IB2_WDMA_QID2 GENMASK(13, 12) +#define MTK_FOE_IB2_WDMA_DEVIDX BIT(16) +#define MTK_FOE_IB2_WDMA_WINFO BIT(17) #define MTK_FOE_IB2_PORT_MG GENMASK(17, 12) @@ -58,9 +58,9 @@ enum { #define MTK_FOE_IB2_DSCP GENMASK(31, 24) -#define MTK_FOE_VLAN2_WHNAT_BSS GEMMASK(5, 0) -#define MTK_FOE_VLAN2_WHNAT_WCID GENMASK(13, 6) -#define MTK_FOE_VLAN2_WHNAT_RING GENMASK(15, 14) +#define MTK_FOE_VLAN2_WINFO_BSS GENMASK(5, 0) +#define MTK_FOE_VLAN2_WINFO_WCID GENMASK(13, 6) +#define MTK_FOE_VLAN2_WINFO_RING GENMASK(15, 14) enum { MTK_FOE_STATE_INVALID, @@ -281,6 +281,8 @@ int mtk_foe_entry_set_ipv6_tuple(struct mtk_foe_entry *entry, int mtk_foe_entry_set_dsa(struct mtk_foe_entry *entry, int port); int mtk_foe_entry_set_vlan(struct mtk_foe_entry *entry, int vid); int mtk_foe_entry_set_pppoe(struct mtk_foe_entry *entry, int sid); +int mtk_foe_entry_set_wdma(struct mtk_foe_entry *entry, int wdma_idx, int txq, + int bss, int wcid); int mtk_foe_entry_commit(struct mtk_ppe *ppe, struct mtk_foe_entry *entry, u16 timestamp); int mtk_ppe_debugfs_init(struct mtk_ppe *ppe); diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 7bb1f20002b5..bcf342bb9051 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -10,6 +10,7 @@ #include #include #include "mtk_eth_soc.h" +#include "mtk_wed.h" struct mtk_flow_data { struct ethhdr eth; @@ -39,6 +40,7 @@ struct mtk_flow_entry { struct rhash_head node; unsigned long cookie; u16 hash; + s8 wed_index; }; static const struct rhashtable_params mtk_flow_ht_params = { @@ -80,6 +82,35 @@ mtk_flow_offload_mangle_eth(const struct flow_action_entry *act, void *eth) memcpy(dest, src, act->mangle.mask ? 2 : 4); } +static int +mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_info *info) +{ + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = addr, + }; + struct net_device_path path = {}; + + if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) + return -1; + + if (!dev->netdev_ops->ndo_fill_forward_path) + return -1; + + if (dev->netdev_ops->ndo_fill_forward_path(&ctx, &path)) + return -1; + + if (path.type != DEV_PATH_MTK_WDMA) + return -1; + + info->wdma_idx = path.mtk_wdma.wdma_idx; + info->queue = path.mtk_wdma.queue; + info->bss = path.mtk_wdma.bss; + info->wcid = path.mtk_wdma.wcid; + + return 0; +} + static int mtk_flow_mangle_ports(const struct flow_action_entry *act, @@ -149,10 +180,20 @@ mtk_flow_get_dsa_port(struct net_device **dev) static int mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe, - struct net_device *dev) + struct net_device *dev, const u8 *dest_mac, + int *wed_index) { + struct mtk_wdma_info info = {}; int pse_port, dsa_port; + if (mtk_flow_get_wdma_info(dev, dest_mac, &info) == 0) { + mtk_foe_entry_set_wdma(foe, info.wdma_idx, info.queue, info.bss, + info.wcid); + pse_port = 3; + *wed_index = info.wdma_idx; + goto out; + } + dsa_port = mtk_flow_get_dsa_port(&dev); if (dsa_port >= 0) mtk_foe_entry_set_dsa(foe, dsa_port); @@ -164,6 +205,7 @@ mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe, else return -EOPNOTSUPP; +out: mtk_foe_entry_set_pse_port(foe, pse_port); return 0; @@ -179,6 +221,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) struct net_device *odev = NULL; struct mtk_flow_entry *entry; int offload_type = 0; + int wed_index = -1; u16 addr_type = 0; u32 timestamp; u8 l4proto = 0; @@ -326,10 +369,14 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) if (data.pppoe.num == 1) mtk_foe_entry_set_pppoe(&foe, data.pppoe.sid); - err = mtk_flow_set_output_device(eth, &foe, odev); + err = mtk_flow_set_output_device(eth, &foe, odev, data.eth.h_dest, + &wed_index); if (err) return err; + if (wed_index >= 0 && (err = mtk_wed_flow_add(wed_index)) < 0) + return err; + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; @@ -343,6 +390,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) } entry->hash = hash; + entry->wed_index = wed_index; err = rhashtable_insert_fast(ð->flow_table, &entry->node, mtk_flow_ht_params); if (err < 0) @@ -353,6 +401,8 @@ clear_flow: mtk_foe_entry_clear(ð->ppe, hash); free: kfree(entry); + if (wed_index >= 0) + mtk_wed_flow_remove(wed_index); return err; } @@ -369,6 +419,8 @@ mtk_flow_offload_destroy(struct mtk_eth *eth, struct flow_cls_offload *f) mtk_foe_entry_clear(ð->ppe, entry->hash); rhashtable_remove_fast(ð->flow_table, &entry->node, mtk_flow_ht_params); + if (entry->wed_index >= 0) + mtk_wed_flow_remove(entry->wed_index); kfree(entry); return 0; diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h index 404c9a9b130d..981ec613f4b0 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.h +++ b/drivers/net/ethernet/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #include #include +#include struct mtk_eth; @@ -27,6 +28,12 @@ struct mtk_wed_hw { int index; }; +struct mtk_wdma_info { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; +}; #ifdef CONFIG_NET_MEDIATEK_SOC_WED static inline void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6a1e7f643da..7b2a0b739684 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -862,6 +862,7 @@ enum net_device_path_type { DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, + DEV_PATH_MTK_WDMA, }; struct net_device_path { @@ -887,6 +888,12 @@ struct net_device_path { int port; u16 proto; } dsa; + struct { + u8 wdma_idx; + u8 queue; + u16 wcid; + u8 bss; + } mtk_wdma; }; }; diff --git a/net/core/dev.c b/net/core/dev.c index 2ec17358d7b4..d5a362d53b34 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -701,6 +701,10 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, if (WARN_ON_ONCE(last_dev == ctx.dev)) return -1; } + + if (!ctx.dev) + return ret; + path = dev_fwd_path(stack); if (!path) return -1; -- cgit v1.2.3 From 6264f58ca0e54e41d63c2d00334a48bac28fbf30 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Apr 2022 14:37:54 -0700 Subject: net: extract a few internals from netdevice.h There's a number of functions and static variables used under net/core/ but not from the outside. We currently dump most of them into netdevice.h. That bad for many reasons: - netdevice.h is very cluttered, hard to figure out what the APIs are; - netdevice.h is very long; - we have to touch netdevice.h more which causes expensive incremental builds. Create a header under net/core/ and move some declarations. The new header is also a bit of a catch-all but that's fine, if we create more specific headers people will likely over-think where their declaration fit best. And end up putting them in netdevice.h, again. More work should be done on splitting netdevice.h into more targeted headers, but that'd be more time consuming so small steps. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 72 +----------------------------------- net/core/dev.c | 1 + net/core/dev.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev_addr_lists.c | 2 + net/core/dev_ioctl.c | 2 + net/core/link_watch.c | 1 + net/core/net-procfs.c | 2 + net/core/net-sysfs.c | 1 + net/core/rtnetlink.c | 2 + net/core/sock.c | 2 + net/core/sysctl_net_core.c | 2 + 11 files changed, 108 insertions(+), 70 deletions(-) create mode 100644 net/core/dev.h (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b2a0b739684..7e7b2a72e473 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -59,7 +59,8 @@ struct dsa_port; struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; - +struct netdev_name_node; +struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; @@ -1020,16 +1021,6 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; -struct netdev_name_node { - struct hlist_node hlist; - struct list_head list; - struct net_device *dev; - const char *name; -}; - -int netdev_name_node_alt_create(struct net_device *dev, const char *name); -int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); - struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; @@ -2975,7 +2966,6 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -int netdev_get_name(struct net *net, char *name, int ifindex); int dev_restart(struct net_device *dev); @@ -3034,19 +3024,6 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } -#ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ -struct sd_flow_limit { - u64 count; - unsigned int num_buckets; - unsigned int history_head; - u16 history[FLOW_LIMIT_HISTORY]; - u8 buckets[]; -}; - -extern int netdev_flow_limit_table_len; -#endif /* CONFIG_NET_FLOW_LIMIT */ - /* * Incoming packets are placed on per-CPU queues */ @@ -3770,7 +3747,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); void __dev_notify_flags(struct net_device *, unsigned int old_flags, unsigned int gchanges); -int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, @@ -3782,13 +3758,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, return __dev_change_net_namespace(dev, net, pat, 0); } int __dev_set_mtu(struct net_device *, int); -int dev_validate_mtu(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); int dev_set_mtu(struct net_device *, int); -int dev_change_tx_queue_len(struct net_device *, unsigned long); -void dev_set_group(struct net_device *, int); int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, struct netlink_ext_ack *extack); int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, @@ -3796,24 +3766,13 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name); -int dev_change_carrier(struct net_device *, bool new_carrier); -int dev_get_phys_port_id(struct net_device *dev, - struct netdev_phys_item_id *ppid); -int dev_get_phys_port_name(struct net_device *dev, - char *name, size_t len); int dev_get_port_parent_id(struct net_device *dev, struct netdev_phys_item_id *ppid, bool recurse); bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); -int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); -typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, int expected_fd, u32 flags); int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); @@ -3898,13 +3857,6 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, bool dev_nit_active(struct net_device *dev); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); -extern int netdev_budget; -extern unsigned int netdev_budget_usecs; - -/* Used by rtnetlink.c:__rtnl_unlock()/rtnl_unlock() */ -extern struct list_head net_todo_list; -void netdev_run_todo(void); - static inline void __dev_put(struct net_device *dev) { if (dev) { @@ -4021,10 +3973,7 @@ static inline void dev_replace_track(struct net_device *odev, * called netif_lowerlayer_*() because they represent the state of any * kind of lower layer not just hardware media. */ - -void linkwatch_init_dev(struct net_device *dev); void linkwatch_fire_event(struct net_device *dev); -void linkwatch_forget_dev(struct net_device *dev); /** * netif_carrier_ok - test if carrier present @@ -4470,9 +4419,6 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); -void dev_addr_flush(struct net_device *dev); -int dev_addr_init(struct net_device *dev); -void dev_addr_check(struct net_device *dev); /* Functions used for unicast addresses handling */ int dev_uc_add(struct net_device *dev, const unsigned char *addr); @@ -4562,7 +4508,6 @@ static inline void __dev_mc_unsync(struct net_device *dev, /* Functions used for secondary unicast and multicast support */ void dev_set_rx_mode(struct net_device *dev); -void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); @@ -4580,11 +4525,6 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; -extern int netdev_tstamp_prequeue; -extern int netdev_unregister_timeout_secs; -extern int weight_p; -extern int dev_weight_rx_bias; -extern int dev_weight_tx_bias; extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; @@ -4772,12 +4712,6 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); -#ifdef CONFIG_PROC_FS -int __init dev_proc_init(void); -#else -#define dev_proc_init() 0 -#endif - static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) @@ -4813,8 +4747,6 @@ extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -void linkwatch_run_queue(void); - static inline netdev_features_t netdev_intersect_features(netdev_features_t f1, netdev_features_t f2) { diff --git a/net/core/dev.c b/net/core/dev.c index 8755ad71be6c..f00d29856b43 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -151,6 +151,7 @@ #include #include +#include "dev.h" #include "net-sysfs.h" diff --git a/net/core/dev.h b/net/core/dev.h new file mode 100644 index 000000000000..27923df00637 --- /dev/null +++ b/net/core/dev.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_DEV_H +#define _NET_CORE_DEV_H + +#include + +struct net; +struct net_device; +struct netdev_bpf; +struct netdev_phys_item_id; +struct netlink_ext_ack; + +/* Random bits of netdevice that don't need to be exposed */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; + +#ifdef CONFIG_PROC_FS +int __init dev_proc_init(void); +#else +#define dev_proc_init() 0 +#endif + +void linkwatch_init_dev(struct net_device *dev); +void linkwatch_forget_dev(struct net_device *dev); +void linkwatch_run_queue(void); + +void dev_addr_flush(struct net_device *dev); +int dev_addr_init(struct net_device *dev); +void dev_addr_check(struct net_device *dev); + +/* sysctls not referred to from outside net/core/ */ +extern int netdev_budget; +extern unsigned int netdev_budget_usecs; + +extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; +extern int weight_p; +extern int dev_weight_rx_bias; +extern int dev_weight_tx_bias; + +/* rtnl helpers */ +extern struct list_head net_todo_list; +void netdev_run_todo(void); + +/* netdev management, shared between various uAPI entry points */ +struct netdev_name_node { + struct hlist_node hlist; + struct list_head list; + struct net_device *dev; + const char *name; +}; + +int netdev_get_name(struct net *net, char *name, int ifindex); +int dev_change_name(struct net_device *dev, const char *newname); + +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + +int dev_validate_mtu(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); +int dev_set_mtu_ext(struct net_device *dev, int mtu, + struct netlink_ext_ack *extack); + +int dev_get_phys_port_id(struct net_device *dev, + struct netdev_phys_item_id *ppid); +int dev_get_phys_port_name(struct net_device *dev, + char *name, size_t len); + +int dev_change_proto_down(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); + +typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, int expected_fd, u32 flags); + +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void dev_set_group(struct net_device *dev, int new_group); +int dev_change_carrier(struct net_device *dev, bool new_carrier); + +void __dev_set_rx_mode(struct net_device *dev); + +#endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index bead38ca50bd..baa63dee2829 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -12,6 +12,8 @@ #include #include +#include "dev.h" + /* * General list handling functions */ diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 1b807d119da5..4f6be442ae7e 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -10,6 +10,8 @@ #include #include +#include "dev.h" + /* * Map an interface index to its name (SIOCGIFNAME) */ diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 95098d1a49bd..a244d3bade7d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -18,6 +18,7 @@ #include #include +#include "dev.h" enum lw_bits { LW_URGENT = 0, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 88cc0ad7d386..1ec23bf8b05c 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -4,6 +4,8 @@ #include #include +#include "dev.h" + #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 9cbc1c8289bc..4980c3a50475 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include #include +#include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0e4502d641eb..4041b3e2e8ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -54,6 +54,8 @@ #include #include +#include "dev.h" + #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 40 diff --git a/net/core/sock.c b/net/core/sock.c index 1180a0cb0110..7000403eaeb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -141,6 +141,8 @@ #include +#include "dev.h" + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7123fe7feeac..8295e5877eb3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,8 @@ #include #include +#include "dev.h" + static int two = 2; static int three = 3; static int int_3600 = 3600; -- cgit v1.2.3 From 794c24e9921f32ded4422833a990ccf11dc3c00e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 6 Apr 2022 17:26:00 +0000 Subject: net-core: rx_otherhost_dropped to core_stats Increment rx_otherhost_dropped counter when packet dropped due to mismatched dest MAC addr. An example when this drop can occur is when manually crafting raw packets that will be consumed by a user space application via a tap device. For testing purposes local traffic was generated using trafgen for the client and netcat to start a server Tested: Created 2 netns, sent 1 packet using trafgen from 1 to the other with "{eth(daddr=$INCORRECT_MAC...}", verified that iproute2 showed the counter was incremented. (Also had to modify iproute2 to show the stat, additional patch for that coming next.) Signed-off-by: Jeffrey Ji Reviewed-by: Brian Vazquez Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220406172600.1141083-1-jeffreyjilinux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ include/uapi/linux/if_link.h | 5 +++++ net/core/dev.c | 1 + net/ipv4/ip_input.c | 1 + net/ipv6/ip6_input.c | 1 + 5 files changed, 10 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7e7b2a72e473..28ea4f8269d4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -203,6 +203,7 @@ struct net_device_core_stats { local_t rx_dropped; local_t tx_dropped; local_t rx_nohandler; + local_t rx_otherhost_dropped; } __aligned(4 * sizeof(local_t)); #include @@ -3837,6 +3838,7 @@ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) DEV_CORE_STATS_INC(rx_nohandler) +DEV_CORE_STATS_INC(rx_otherhost_dropped) static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cc284c048e69..d1e600816b82 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -211,6 +211,9 @@ struct rtnl_link_stats { * @rx_nohandler: Number of packets received on the interface * but dropped by the networking stack because the device is * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. */ struct rtnl_link_stats64 { __u64 rx_packets; @@ -243,6 +246,8 @@ struct rtnl_link_stats64 { __u64 rx_compressed; __u64 tx_compressed; __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; }; /* Subset of link stats useful for in-HW collection. Meaning of the fields is as diff --git a/net/core/dev.c b/net/core/dev.c index f00d29856b43..e027410e861b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10358,6 +10358,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, storage->rx_dropped += local_read(&core_stats->rx_dropped); storage->tx_dropped += local_read(&core_stats->tx_dropped); storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + storage->rx_otherhost_dropped += local_read(&core_stats->rx_otherhost_dropped); } } return storage; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 95f7bb052784..b1165f717cd1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -451,6 +451,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 5b5ea35635f9..b4880c7c84eb 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -150,6 +150,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, struct inet6_dev *idev; if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); kfree_skb(skb); return NULL; } -- cgit v1.2.3 From 1306d5362a591493a2d07f685ed2cc480dcda320 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 13 Apr 2022 13:51:56 +0300 Subject: net: add ndo_fdb_del_bulk Add a new netdev op called ndo_fdb_del_bulk, it will be later used for driver-specific bulk delete implementation dispatched from rtnetlink. The first user will be the bridge, we need it to signal to rtnetlink from the driver that we support bulk delete operation (NLM_F_BULK). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28ea4f8269d4..a602f29365b0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1260,6 +1260,10 @@ struct netdev_net_notifier { * struct net_device *dev, * const unsigned char *addr, u16 vid) * Deletes the FDB entry from dev coresponding to addr. + * int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], + * struct net_device *dev, + * u16 vid, + * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) @@ -1510,6 +1514,11 @@ struct net_device_ops { struct net_device *dev, const unsigned char *addr, u16 vid); + int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + u16 vid, + struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From 2f1e85b1aee459b7d0fd981839042c6a38ffaf0c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sat, 16 Apr 2022 00:40:45 +0800 Subject: net: sched: use queue_mapping to pick tx queue This patch fixes issue: * If we install tc filters with act_skbedit in clsact hook. It doesn't work, because netdev_core_pick_tx() overwrites queue_mapping. $ tc filter ... action skbedit queue_mapping 1 And this patch is useful: * We can use FQ + EDT to implement efficient policies. Tx queues are picked by xps, ndo_select_queue of netdev driver, or skb hash in netdev_core_pick_tx(). In fact, the netdev driver, and skb hash are _not_ under control. xps uses the CPUs map to select Tx queues, but we can't figure out which task_struct of pod/containter running on this cpu in most case. We can use clsact filters to classify one pod/container traffic to one Tx queue. Why ? In containter networking environment, there are two kinds of pod/ containter/net-namespace. One kind (e.g. P1, P2), the high throughput is key in these applications. But avoid running out of network resource, the outbound traffic of these pods is limited, using or sharing one dedicated Tx queues assigned HTB/TBF/FQ Qdisc. Other kind of pods (e.g. Pn), the low latency of data access is key. And the traffic is not limited. Pods use or share other dedicated Tx queues assigned FIFO Qdisc. This choice provides two benefits. First, contention on the HTB/FQ Qdisc lock is significantly reduced since fewer CPUs contend for the same queue. More importantly, Qdisc contention can be eliminated completely if each CPU has its own FIFO Qdisc for the second kind of pods. There must be a mechanism in place to support classifying traffic based on pods/container to different Tx queues. Note that clsact is outside of Qdisc while Qdisc can run a classifier to select a sub-queue under the lock. In general recording the decision in the skb seems a little heavy handed. This patch introduces a per-CPU variable, suggested by Eric. The xmit.skip_txqueue flag is firstly cleared in __dev_queue_xmit(). - Tx Qdisc may install that skbedit actions, then xmit.skip_txqueue flag is set in qdisc->enqueue() though tx queue has been selected in netdev_tx_queue_mapping() or netdev_core_pick_tx(). That flag is cleared firstly in __dev_queue_xmit(), is useful: - Avoid picking Tx queue with netdev_tx_queue_mapping() in next netdev in such case: eth0 macvlan - eth0.3 vlan - eth0 ixgbe-phy: For example, eth0, macvlan in pod, which root Qdisc install skbedit queue_mapping, send packets to eth0.3, vlan in host. In __dev_queue_xmit() of eth0.3, clear the flag, does not select tx queue according to skb->queue_mapping because there is no filters in clsact or tx Qdisc of this netdev. Same action taked in eth0, ixgbe in Host. - Avoid picking Tx queue for next packet. If we set xmit.skip_txqueue in tx Qdisc (qdisc->enqueue()), the proper way to clear it is clearing it in __dev_queue_xmit when processing next packets. For performance reasons, use the static key. If user does not config the NET_EGRESS, the patch will not be compiled. +----+ +----+ +----+ | P1 | | P2 | | Pn | +----+ +----+ +----+ | | | +-----------+-----------+ | | clsact/skbedit | MQ v +-----------+-----------+ | q0 | q1 | qn v v v HTB/FQ HTB/FQ ... FIFO Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Jonathan Lemon Cc: Eric Dumazet Cc: Alexander Lobakin Cc: Paolo Abeni Cc: Talal Ahmad Cc: Kevin Hao Cc: Ilias Apalodimas Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Antoine Tenart Cc: Wei Wang Cc: Arnd Bergmann Suggested-by: Eric Dumazet Signed-off-by: Tonghao Zhang Acked-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ include/linux/rtnetlink.h | 1 + net/core/dev.c | 31 +++++++++++++++++++++++++++++-- net/sched/act_skbedit.c | 6 +++++- 4 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a602f29365b0..7dccbfd1bf56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3061,6 +3061,9 @@ struct softnet_data { struct { u16 recursion; u8 more; +#ifdef CONFIG_NET_EGRESS + u8 skip_txqueue; +#endif } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 7f970b16da3a..ae2c6a3cec5d 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -100,6 +100,7 @@ void net_dec_ingress_queue(void); #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); +void netdev_xmit_skip_txqueue(bool skip); #endif void rtnetlink_init(void); diff --git a/net/core/dev.c b/net/core/dev.c index ba853e878007..4a77ebda4fb1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3925,6 +3925,25 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; } + +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_XPS @@ -4095,7 +4114,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; - struct netdev_queue *txq; + struct netdev_queue *txq = NULL; struct Qdisc *q; int rc = -ENOMEM; bool again = false; @@ -4123,11 +4142,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; } + + netdev_xmit_skip_txqueue(false); + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; nf_skip_egress(skb, false); + + if (netdev_xmit_txqueue_skipped()) + txq = netdev_tx_queue_mapping(dev, skb); } #endif /* If device/qdisc don't need skb->dst, release it right now while @@ -4138,7 +4163,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) else skb_dst_force(skb); - txq = netdev_core_pick_tx(dev, skb, sb_dev); + if (!txq) + txq = netdev_core_pick_tx(dev, skb, sb_dev); + q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 92d0dc754207..1c5fdb6e7c2f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -58,8 +58,12 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, } } if (params->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > params->queue_mapping) + skb->dev->real_num_tx_queues > params->queue_mapping) { +#ifdef CONFIG_NET_EGRESS + netdev_xmit_skip_txqueue(true); +#endif skb_set_queue_mapping(skb, params->queue_mapping); + } if (params->flags & SKBEDIT_F_MARK) { skb->mark &= ~params->mask; skb->mark |= params->mark & params->mask; -- cgit v1.2.3 From 68822bdf76f10c3dc80609d4e2cdc1e847429086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Apr 2022 13:12:37 -0700 Subject: net: generalize skb freeing deferral to per-cpu lists Logic added in commit f35f821935d8 ("tcp: defer skb freeing after socket lock is released") helped bulk TCP flows to move the cost of skbs frees outside of critical section where socket lock was held. But for RPC traffic, or hosts with RFS enabled, the solution is far from being ideal. For RPC traffic, recvmsg() has to return to user space right after skb payload has been consumed, meaning that BH handler has no chance to pick the skb before recvmsg() thread. This issue is more visible with BIG TCP, as more RPC fit one skb. For RFS, even if BH handler picks the skbs, they are still picked from the cpu on which user thread is running. Ideally, it is better to free the skbs (and associated page frags) on the cpu that originally allocated them. This patch removes the per socket anchor (sk->defer_list) and instead uses a per-cpu list, which will hold more skbs per round. This new per-cpu list is drained at the end of net_action_rx(), after incoming packets have been processed, to lower latencies. In normal conditions, skbs are added to the per-cpu list with no further action. In the (unlikely) cases where the cpu does not run net_action_rx() handler fast enough, we use an IPI to raise NET_RX_SOFTIRQ on the remote cpu. Also, we do not bother draining the per-cpu list from dev_cpu_dead() This is because skbs in this list have no requirement on how fast they should be freed. Note that we can add in the future a small per-cpu cache if we see any contention on sd->defer_lock. Tested on a pair of hosts with 100Gbit NIC, RFS enabled, and /proc/sys/net/ipv4/tcp_rmem[2] tuned to 16MB to work around page recycling strategy used by NIC driver (its page pool capacity being too small compared to number of skbs/pages held in sockets receive queues) Note that this tuning was only done to demonstrate worse conditions for skb freeing for this particular test. These conditions can happen in more general production workload. 10 runs of one TCP_STREAM flow Before: Average throughput: 49685 Mbit. Kernel profiles on cpu running user thread recvmsg() show high cost for skb freeing related functions (*) 57.81% [kernel] [k] copy_user_enhanced_fast_string (*) 12.87% [kernel] [k] skb_release_data (*) 4.25% [kernel] [k] __free_one_page (*) 3.57% [kernel] [k] __list_del_entry_valid 1.85% [kernel] [k] __netif_receive_skb_core 1.60% [kernel] [k] __skb_datagram_iter (*) 1.59% [kernel] [k] free_unref_page_commit (*) 1.16% [kernel] [k] __slab_free 1.16% [kernel] [k] _copy_to_iter (*) 1.01% [kernel] [k] kfree (*) 0.88% [kernel] [k] free_unref_page 0.57% [kernel] [k] ip6_rcv_core 0.55% [kernel] [k] ip6t_do_table 0.54% [kernel] [k] flush_smp_call_function_queue (*) 0.54% [kernel] [k] free_pcppages_bulk 0.51% [kernel] [k] llist_reverse_order 0.38% [kernel] [k] process_backlog (*) 0.38% [kernel] [k] free_pcp_prepare 0.37% [kernel] [k] tcp_recvmsg_locked (*) 0.37% [kernel] [k] __list_add_valid 0.34% [kernel] [k] sock_rfree 0.34% [kernel] [k] _raw_spin_lock_irq (*) 0.33% [kernel] [k] __page_cache_release 0.33% [kernel] [k] tcp_v6_rcv (*) 0.33% [kernel] [k] __put_page (*) 0.29% [kernel] [k] __mod_zone_page_state 0.27% [kernel] [k] _raw_spin_lock After patch: Average throughput: 73076 Mbit. Kernel profiles on cpu running user thread recvmsg() looks better: 81.35% [kernel] [k] copy_user_enhanced_fast_string 1.95% [kernel] [k] _copy_to_iter 1.95% [kernel] [k] __skb_datagram_iter 1.27% [kernel] [k] __netif_receive_skb_core 1.03% [kernel] [k] ip6t_do_table 0.60% [kernel] [k] sock_rfree 0.50% [kernel] [k] tcp_v6_rcv 0.47% [kernel] [k] ip6_rcv_core 0.45% [kernel] [k] read_tsc 0.44% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] _raw_spin_lock 0.37% [kernel] [k] native_irq_return_iret 0.33% [kernel] [k] __inet6_lookup_established 0.31% [kernel] [k] ip6_protocol_deliver_rcu 0.29% [kernel] [k] tcp_rcv_established 0.29% [kernel] [k] llist_reverse_order v2: kdoc issue (kernel bots) do not defer if (alloc_cpu == smp_processor_id()) (Paolo) replace the sk_buff_head with a single-linked list (Jakub) add a READ_ONCE()/WRITE_ONCE() for the lockless read of sd->defer_list Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220422201237.416238-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ include/linux/skbuff.h | 3 +++ include/net/sock.h | 2 -- include/net/tcp.h | 12 ----------- net/core/dev.c | 31 ++++++++++++++++++++++++++++ net/core/skbuff.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++- net/core/sock.c | 3 --- net/ipv4/tcp.c | 25 +---------------------- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/tls/tls_sw.c | 2 -- 11 files changed, 90 insertions(+), 46 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dccbfd1bf56..ac8a5f71220a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3081,6 +3081,11 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + /* Another possibly contended cache line */ + spinlock_t defer_lock ____cacheline_aligned_in_smp; + int defer_count; + struct sk_buff *defer_list; + call_single_data_t defer_csd; }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 84d78df60453..5cbc184ca685 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t; * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS + * @alloc_cpu: CPU which did the skb allocation. * @secmark: security marking * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available @@ -1080,6 +1081,7 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif @@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb_around(struct sk_buff *skb, void *data, unsigned int frag_size); +void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); diff --git a/include/net/sock.h b/include/net/sock.h index a01d6c421aa2..f9f8ecae0f8d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -292,7 +292,6 @@ struct sk_filter; * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held - * @defer_list: head of llist storing skbs to be freed * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, @@ -417,7 +416,6 @@ struct sock { struct sk_buff *head; struct sk_buff *tail; } sk_backlog; - struct llist_head defer_list; #define sk_rmem_alloc sk_backlog.rmem_alloc diff --git a/include/net/tcp.h b/include/net/tcp.h index 679b1964d494..94a52ad1101c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -#ifdef CONFIG_INET -void __sk_defer_free_flush(struct sock *sk); - -static inline void sk_defer_free_flush(struct sock *sk) -{ - if (llist_empty(&sk->defer_list)) - return; - __sk_defer_free_flush(sk); -} -#else -static inline void sk_defer_free_flush(struct sock *sk) {} -#endif int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); diff --git a/net/core/dev.c b/net/core/dev.c index 4a77ebda4fb1..611bd7197064 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4545,6 +4545,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ +/* Called from hardirq (IPI) context */ +static void trigger_rx_softirq(void *data __always_unused) +{ + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 @@ -6571,6 +6577,28 @@ static int napi_threaded_poll(void *data) return 0; } +static void skb_defer_free_flush(struct softnet_data *sd) +{ + struct sk_buff *skb, *next; + unsigned long flags; + + /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ + if (!READ_ONCE(sd->defer_list)) + return; + + spin_lock_irqsave(&sd->defer_lock, flags); + skb = sd->defer_list; + sd->defer_list = NULL; + sd->defer_count = 0; + spin_unlock_irqrestore(&sd->defer_lock, flags); + + while (skb != NULL) { + next = skb->next; + __kfree_skb(skb); + skb = next; + } +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6616,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); + skb_defer_free_flush(sd); } struct netdev_adjacent { @@ -11326,6 +11355,8 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 30b523fa4ad2..028a280fbabd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -204,7 +204,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; - + skb->alloc_cpu = raw_smp_processor_id(); /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); @@ -1037,6 +1037,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif + CHECK_SKB_FIELD(alloc_cpu); #ifdef CONFIG_XPS CHECK_SKB_FIELD(sender_cpu); #endif @@ -6486,3 +6487,51 @@ free_now: } EXPORT_SYMBOL(__skb_ext_put); #endif /* CONFIG_SKB_EXTENSIONS */ + +/** + * skb_attempt_defer_free - queue skb for remote freeing + * @skb: buffer + * + * Put @skb in a per-cpu list, using the cpu which + * allocated the skb/pages to reduce false sharing + * and memory zone spinlock contention. + */ +void skb_attempt_defer_free(struct sk_buff *skb) +{ + int cpu = skb->alloc_cpu; + struct softnet_data *sd; + unsigned long flags; + bool kick; + + if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || + !cpu_online(cpu) || + cpu == raw_smp_processor_id()) { + __kfree_skb(skb); + return; + } + + sd = &per_cpu(softnet_data, cpu); + /* We do not send an IPI or any signal. + * Remote cpu will eventually call skb_defer_free_flush() + */ + spin_lock_irqsave(&sd->defer_lock, flags); + skb->next = sd->defer_list; + /* Paired with READ_ONCE() in skb_defer_free_flush() */ + WRITE_ONCE(sd->defer_list, skb); + sd->defer_count++; + + /* kick every time queue length reaches 128. + * This should avoid blocking in smp_call_function_single_async(). + * This condition should hardly be bit under normal conditions, + * unless cpu suddenly stopped to receive NIC interrupts. + */ + kick = sd->defer_count == 128; + + spin_unlock_irqrestore(&sd->defer_lock, flags); + + /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU + * if we are unlucky enough (this seems very unlikely). + */ + if (unlikely(kick)) + smp_call_function_single_async(cpu, &sd->defer_csd); +} diff --git a/net/core/sock.c b/net/core/sock.c index 29abec3eabd8..a0f3989de3d6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2082,9 +2082,6 @@ void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); - WARN_ON_ONCE(!llist_empty(&sk->defer_list)); - sk_defer_free_flush(sk); - if (rcu_access_pointer(sk->sk_reuseport_cb)) { reuseport_detach_sock(sk); use_call_rcu = true; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e20b87b3bf90..db55af9eb37b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -843,7 +843,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } release_sock(sk); - sk_defer_free_flush(sk); if (spliced) return spliced; @@ -1589,20 +1588,6 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -void __sk_defer_free_flush(struct sock *sk) -{ - struct llist_node *head; - struct sk_buff *skb, *n; - - head = llist_del_all(&sk->defer_list); - llist_for_each_entry_safe(skb, n, head, ll_node) { - prefetch(n); - skb_mark_not_on_list(skb); - __kfree_skb(skb); - } -} -EXPORT_SYMBOL(__sk_defer_free_flush); - static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); @@ -1610,11 +1595,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; - if (!skb_queue_empty(&sk->sk_receive_queue) || - !llist_empty(&sk->defer_list)) { - llist_add(&skb->ll_node, &sk->defer_list); - return; - } + return skb_attempt_defer_free(skb); } __kfree_skb(skb); } @@ -2453,7 +2434,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); - sk_defer_free_flush(sk); sk_wait_data(sk, &timeo, last); } @@ -2571,7 +2551,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); - sk_defer_free_flush(sk); if (cmsg_flags && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) @@ -3096,7 +3075,6 @@ int tcp_disconnect(struct sock *sk, int flags) sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } - sk_defer_free_flush(sk); sk_error_report(sk); return 0; } @@ -4225,7 +4203,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); - sk_defer_free_flush(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c2d42142555..918816ec5dd4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2065,7 +2065,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 54277de7474b..60bdec257ba7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1728,7 +1728,6 @@ process: sk_incoming_cpu_update(sk); - sk_defer_free_flush(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ddbe05ec5489..bc54f6c5b1a4 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1911,7 +1911,6 @@ recv_end: end: release_sock(sk); - sk_defer_free_flush(sk); if (psock) sk_psock_put(sk, psock); return copied ? : err; @@ -1983,7 +1982,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, splice_read_end: release_sock(sk); - sk_defer_free_flush(sk); return copied ? : err; } -- cgit v1.2.3 From c526fd8f9f4f21cb83c0b1c9a1ee9c0ac9be9e2e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 Apr 2022 11:58:46 +0100 Subject: net: inline dev_queue_xmit() Inline dev_queue_xmit() and dev_queue_xmit_accel(), they both are small proxy functions doing nothing but redirecting the control flow to __dev_queue_xmit(). Signed-off-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++-- net/core/dev.c | 15 ++------------- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b75ca2d095ae..4aba92a4042a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2940,10 +2940,20 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -int dev_queue_xmit(struct sk_buff *skb); -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +static inline int dev_queue_xmit(struct sk_buff *skb) +{ + return __dev_queue_xmit(skb, NULL); +} + +static inline int dev_queue_xmit_accel(struct sk_buff *skb, + struct net_device *sb_dev) +{ + return __dev_queue_xmit(skb, sb_dev); +} + static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; diff --git a/net/core/dev.c b/net/core/dev.c index 7a61690d3137..d127164771f2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4111,7 +4111,7 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ -static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) +int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; @@ -4235,18 +4235,7 @@ out: rcu_read_unlock_bh(); return rc; } - -int dev_queue_xmit(struct sk_buff *skb) -{ - return __dev_queue_xmit(skb, NULL); -} -EXPORT_SYMBOL(dev_queue_xmit); - -int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) -{ - return __dev_queue_xmit(skb, sb_dev); -} -EXPORT_SYMBOL(dev_queue_xmit_accel); +EXPORT_SYMBOL(__dev_queue_xmit); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { -- cgit v1.2.3 From 58caed3dacb4354a25a1aa8d2febc3e9648ba1f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 May 2022 16:27:03 -0700 Subject: netdev: reshuffle netif_napi_add() APIs to allow dropping weight Most drivers should not have to worry about selecting the right weight for their NAPI instances and pass NAPI_POLL_WEIGHT. It'd be best if we didn't require the argument at all and selected the default internally. This change prepares the ground for such reshuffling, allowing for a smooth transition. The following API should remain after the next release cycle: netif_napi_add() netif_napi_add_weight() netif_napi_add_tx() netif_napi_add_tx_weight() Where the _weight() variants take an explicit weight argument. I opted for a _weight() suffix rather than a __ prefix, because we use __ in places to mean that caller needs to also issue a synchronize_net() call. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220502232703.396351-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 50 +++++++++++++++++++++++++++++++---------------- net/core/dev.c | 6 +++--- 2 files changed, 36 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4aba92a4042a..eaf66e57d891 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2499,37 +2499,53 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define NAPI_POLL_WEIGHT 64 +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); + /** - * netif_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add() - initialize a NAPI context + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +static inline void +netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netif_napi_add_weight(dev, napi, poll, weight); +} + +static inline void +netif_napi_add_tx_weight(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add_weight(dev, napi, poll, weight); +} + +#define netif_tx_napi_add netif_napi_add_tx_weight /** - * netif_tx_napi_add - initialize a NAPI context - * @dev: network device - * @napi: NAPI context - * @poll: polling function - * @weight: default weight + * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only + * @dev: network device + * @napi: NAPI context + * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ -static inline void netif_tx_napi_add(struct net_device *dev, +static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) + int (*poll)(struct napi_struct *, int)) { - set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); - netif_napi_add(dev, napi, poll, weight); + netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } /** diff --git a/net/core/dev.c b/net/core/dev.c index d127164771f2..c2d73595a7c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6303,8 +6303,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) { if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; @@ -6337,7 +6337,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, if (dev->threaded && napi_kthread_create(napi)) dev->threaded = 0; } -EXPORT_SYMBOL(netif_napi_add); +EXPORT_SYMBOL(netif_napi_add_weight); void napi_disable(struct napi_struct *n) { -- cgit v1.2.3 From 6df6398f7c8b481ce83f28143bc08a5231616deb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:31 -0700 Subject: net: add netif_inherit_tso_max() To make later patches smaller create a helper for inheriting the TSO limitations of a lower device. The TSO in the name is not an accident, subsequent patches will replace GSO with TSO in more names. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 3 +-- drivers/net/ipvlan/ipvlan_main.c | 6 ++---- drivers/net/macvlan.c | 6 ++---- drivers/net/veth.c | 3 +-- drivers/net/vxlan/vxlan_core.c | 3 +-- include/linux/netdevice.h | 3 +++ net/8021q/vlan.c | 3 +-- net/8021q/vlan_dev.c | 3 +-- net/core/dev.c | 12 ++++++++++++ 9 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index ba3fa7eac98d..790e1d5e4b4a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -286,8 +286,7 @@ nfp_repr_transfer_features(struct net_device *netdev, struct net_device *lower) if (repr->dst->u.port_info.lower_dev != lower) return; - netif_set_gso_max_size(netdev, lower->gso_max_size); - netif_set_gso_max_segs(netdev, lower->gso_max_segs); + netif_inherit_tso_max(netdev, lower); netdev_update_features(netdev); } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 696e245f6d00..aa28a29e228c 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -139,8 +139,7 @@ static int ipvlan_init(struct net_device *dev) dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; - netif_set_gso_max_size(dev, phy_dev->gso_max_size); - netif_set_gso_max_segs(dev, phy_dev->gso_max_segs); + netif_inherit_tso_max(dev, phy_dev); dev->hard_header_len = phy_dev->hard_header_len; netdev_lockdep_set_classes(dev); @@ -762,8 +761,7 @@ static int ipvlan_device_event(struct notifier_block *unused, case NETDEV_FEAT_CHANGE: list_for_each_entry(ipvlan, &port->ipvlans, pnode) { - netif_set_gso_max_size(ipvlan->dev, dev->gso_max_size); - netif_set_gso_max_segs(ipvlan->dev, dev->gso_max_segs); + netif_inherit_tso_max(ipvlan->dev, dev); netdev_update_features(ipvlan->dev); } break; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index b00bc8173abe..0017bd0fe3bb 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -904,8 +904,7 @@ static int macvlan_init(struct net_device *dev) dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; - netif_set_gso_max_size(dev, lowerdev->gso_max_size); - netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); + netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); @@ -1763,8 +1762,7 @@ static int macvlan_device_event(struct notifier_block *unused, break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { - netif_set_gso_max_size(vlan->dev, dev->gso_max_size); - netif_set_gso_max_segs(vlan->dev, dev->gso_max_segs); + netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 3592014e50cc..f474e79a7745 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1758,8 +1758,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (ifmp && (dev->ifindex != 0)) peer->ifindex = ifmp->ifi_index; - netif_set_gso_max_size(peer, dev->gso_max_size); - netif_set_gso_max_segs(peer, dev->gso_max_segs); + netif_inherit_tso_max(peer, dev); err = register_netdevice(peer); put_net(net); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 8a5e3a6d32d7..2bda692f70c5 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3683,8 +3683,7 @@ static void vxlan_config_apply(struct net_device *dev, if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; - netif_set_gso_max_size(dev, lowerdev->gso_max_size); - netif_set_gso_max_segs(dev, lowerdev->gso_max_segs); + netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaf66e57d891..006bb5c0e413 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4895,6 +4895,9 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_inherit_tso_max(struct net_device *to, + const struct net_device *from); + static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 788076b002b3..e40aa3e3641c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -319,8 +319,7 @@ static void vlan_transfer_features(struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); - netif_set_gso_max_size(vlandev, dev->gso_max_size); - netif_set_gso_max_segs(vlandev, dev->gso_max_segs); + netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e5d23e75572a..839f2020b015 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -573,8 +573,7 @@ static int vlan_dev_init(struct net_device *dev) NETIF_F_ALL_FCOE; dev->features |= dev->hw_features | NETIF_F_LLTX; - netif_set_gso_max_size(dev, real_dev->gso_max_size); - netif_set_gso_max_segs(dev, real_dev->gso_max_segs); + netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); diff --git a/net/core/dev.c b/net/core/dev.c index c2d73595a7c3..eef1d19b130f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2992,6 +2992,18 @@ undo_rx: } EXPORT_SYMBOL(netif_set_real_num_queues); +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_gso_max_size(to, from->gso_max_size); + netif_set_gso_max_segs(to, from->gso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + /** * netif_get_num_default_rss_queues - default number of RSS queues * -- cgit v1.2.3 From 14d7b8122fd591693a2388b98563707ba72c6780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:32 -0700 Subject: net: don't allow user space to lift the device limits Up until commit 46e6b992c250 ("rtnetlink: allow GSO maximums to be set on device creation") the gso_max_segs and gso_max_size of a device were not controlled from user space. The quoted commit added the ability to control them because of the following setup: netns A | netns B veth<->veth eth0 If eth0 has TSO limitations and user wants to efficiently forward traffic between eth0 and the veths they should copy the TSO limitations of eth0 onto the veths. This would happen automatically for macvlans or ipvlan but veth users are not so lucky (given the loose coupling). Unfortunately the commit in question allowed users to also override the limits on real HW devices. It may be useful to control the max GSO size and someone may be using that ability (not that I know of any user), so create a separate set of knobs to reliably record the TSO limitations. Validate the user requests. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ net/core/dev.c | 35 +++++++++++++++++++++++++++++++++++ net/core/rtnetlink.c | 4 ++-- 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 006bb5c0e413..e12f7de6d6ae 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1917,8 +1917,10 @@ enum netdev_ml_priv_type { * @rtnl_link_ops: Rtnl_link_ops * * @gso_max_size: Maximum size of generic segmentation offload + * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO + * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device @@ -2262,8 +2264,13 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define TSO_LEGACY_MAX_SIZE 65536 +#define TSO_MAX_SIZE UINT_MAX + unsigned int tso_max_size; #define GSO_MAX_SEGS 65535 u16 gso_max_segs; +#define TSO_MAX_SEGS U16_MAX + u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; @@ -4895,6 +4902,8 @@ static inline void netif_set_gro_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_max_size, size); } +void netif_set_tso_max_size(struct net_device *dev, unsigned int size); +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, const struct net_device *from); diff --git a/net/core/dev.c b/net/core/dev.c index eef1d19b130f..6ae085b11373 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2992,6 +2992,39 @@ undo_rx: } EXPORT_SYMBOL(netif_set_real_num_queues); +/** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = size; + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + /** * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper * @to: netdev to update @@ -10572,6 +10605,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eea5ed09e1bb..e6d4b9272995 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2803,7 +2803,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE) { + if (max_size > GSO_MAX_SIZE || max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } @@ -2817,7 +2817,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS) { + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { err = -EINVAL; goto errout; } -- cgit v1.2.3 From 744d49daf8bd3b17b345c836f2e6f97d49fa6ae8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 May 2022 19:51:34 -0700 Subject: net: move netif_set_gso_max helpers These are now internal to the core, no need to expose them. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 --------------------- net/core/dev.h | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e12f7de6d6ae..8cf0ac616cb9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4881,27 +4881,6 @@ static inline bool netif_needs_gso(struct sk_buff *skb, (skb->ip_summed != CHECKSUM_UNNECESSARY))); } -static inline void netif_set_gso_max_size(struct net_device *dev, - unsigned int size) -{ - /* dev->gso_max_size is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_size, size); -} - -static inline void netif_set_gso_max_segs(struct net_device *dev, - unsigned int segs) -{ - /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ - WRITE_ONCE(dev->gso_max_segs, segs); -} - -static inline void netif_set_gro_max_size(struct net_device *dev, - unsigned int size) -{ - /* This pairs with the READ_ONCE() in skb_gro_receive() */ - WRITE_ONCE(dev->gro_max_size, size); -} - void netif_set_tso_max_size(struct net_device *dev, unsigned int size); void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs); void netif_inherit_tso_max(struct net_device *to, diff --git a/net/core/dev.h b/net/core/dev.h index 27923df00637..328b37af90ba 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -88,4 +88,25 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + #endif -- cgit v1.2.3 From ca4567f1e6f660f86fcd04f3563c0045b0d4772f Mon Sep 17 00:00:00 2001 From: Alaa Mohamed Date: Thu, 5 May 2022 17:09:57 +0200 Subject: rtnetlink: add extack support in fdb del handlers Add extack support to .ndo_fdb_del in netdevice.h and all related methods. Signed-off-by: Alaa Mohamed Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_main.c | 3 ++- drivers/net/ethernet/mscc/ocelot_net.c | 3 ++- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 3 ++- drivers/net/macvlan.c | 3 ++- drivers/net/vxlan/vxlan_core.c | 3 ++- include/linux/netdevice.h | 2 +- net/bridge/br_fdb.c | 3 ++- net/bridge/br_private.h | 3 ++- net/core/rtnetlink.c | 4 ++-- 9 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 049a3f48caff..867908f94661 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5688,11 +5688,12 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], * @dev: the net device pointer * @addr: the MAC address entry being added * @vid: VLAN ID + * @extack: netlink extended ack */ static int ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - __always_unused u16 vid) + __always_unused u16 vid, struct netlink_ext_ack *extack) { int err; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 247bc105bdd2..616d8127ef51 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -774,7 +774,8 @@ static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int ocelot_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index d320567b2cca..28476b982bab 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -368,7 +368,8 @@ static int qlcnic_set_mac(struct net_device *netdev, void *p) static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *netdev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(netdev); int err = -EOPNOTSUPP; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 0017bd0fe3bb..eff75beb1395 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1020,7 +1020,8 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 2bda692f70c5..91b7bb01fb10 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1280,7 +1280,8 @@ out: /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cf0ac616cb9..74c97a34921d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1513,7 +1513,7 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid); + u16 vid, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 1a3d583fbc8e..e7f4fccb6adb 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1253,7 +1253,8 @@ static int __br_fdb_delete(struct net_bridge *br, /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6ae882cfae1c..06e5f6faa431 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -793,7 +793,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, u16 vid); + struct net_device *dev, const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack); int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, u16 vid, struct netlink_ext_ack *extack); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e6d4b9272995..6aff02df9ba5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4258,7 +4258,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, @@ -4276,7 +4276,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { -- cgit v1.2.3 From 97dc7cd92ac67f6e05df418df1772ba4a7fbf693 Mon Sep 17 00:00:00 2001 From: Gerhard Engleder Date: Fri, 6 May 2022 22:01:40 +0200 Subject: ptp: Support late timestamp determination If a physical clock supports a free running cycle counter, then timestamps shall be based on this time too. For TX it is known in advance before the transmission if a timestamp based on the free running cycle counter is needed. For RX it is impossible to know which timestamp is needed before the packet is received and assigned to a socket. Support late timestamp determination by a network device. Therefore, an address/cookie is stored within the new netdev_data field of struct skb_shared_hwtstamps. This address/cookie is provided to a new network device function called ndo_get_tstamp(), which returns a timestamp based on the normal/adjustable time or based on the free running cycle counter. If function is not supported, then timestamp handling is not changed. This mechanism is intended for RX, but TX use is also possible. Signed-off-by: Gerhard Engleder Acked-by: Jonathan Lemon Acked-by: Richard Cochran Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 21 ++++++++++++++++++++ include/linux/skbuff.h | 14 +++++++++++--- net/socket.c | 49 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74c97a34921d..51fe143091cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1356,6 +1356,12 @@ struct netdev_net_notifier { * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address + * ktime_t (*ndo_get_tstamp)(struct net_device *dev, + * const struct skb_shared_hwtstamps *hwtstamps, + * bool cycles); + * Get hardware timestamp based on normal/adjustable time or free running + * cycle counter. This function is required if physical clock supports a + * free running cycle counter. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1578,6 +1584,9 @@ struct net_device_ops { struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + ktime_t (*ndo_get_tstamp)(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles); }; /** @@ -4761,6 +4770,18 @@ static inline void netdev_rx_csum_fault(struct net_device *dev, void net_enable_timestamp(void); void net_disable_timestamp(void); +static inline ktime_t netdev_get_tstamp(struct net_device *dev, + const struct skb_shared_hwtstamps *hwtstamps, + bool cycles) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_tstamp) + return ops->ndo_get_tstamp(dev, hwtstamps, cycles); + + return hwtstamps->hwtstamp; +} + static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4a4f25975ca2..97de40bcfef6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -588,8 +588,10 @@ static inline bool skb_frag_must_loop(struct page *p) /** * struct skb_shared_hwtstamps - hardware time stamps - * @hwtstamp: hardware time stamp transformed into duration - * since arbitrary point in time + * @hwtstamp: hardware time stamp transformed into duration + * since arbitrary point in time + * @netdev_data: address/cookie of network device driver used as + * reference to actual hardware time stamp * * Software time stamps generated by ktime_get_real() are stored in * skb->tstamp. @@ -601,7 +603,10 @@ static inline bool skb_frag_must_loop(struct page *p) * &skb_shared_info. Use skb_hwtstamps() to get a pointer. */ struct skb_shared_hwtstamps { - ktime_t hwtstamp; + union { + ktime_t hwtstamp; + void *netdev_data; + }; }; /* Definitions for tx_flags in struct skb_shared_info */ @@ -621,6 +626,9 @@ enum { /* generate wifi status information (where possible) */ SKBTX_WIFI_STATUS = 1 << 4, + /* determine hardware time stamp based on time or cycles */ + SKBTX_HW_TSTAMP_NETDEV = 1 << 5, + /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, }; diff --git a/net/socket.c b/net/socket.c index 0f680c7d968a..6ee634c01eca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -805,7 +805,28 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } -static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) +{ + bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC; + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + struct net_device *orig_dev; + ktime_t hwtstamp; + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) { + *if_index = orig_dev->ifindex; + hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); + } else { + hwtstamp = shhwtstamps->hwtstamp; + } + rcu_read_unlock(); + + return hwtstamp; +} + +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, + int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; @@ -815,11 +836,14 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); - rcu_read_lock(); - orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); - if (orig_dev) - ts_pktinfo.if_index = orig_dev->ifindex; - rcu_read_unlock(); + if (!if_index) { + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + if_index = orig_dev->ifindex; + rcu_read_unlock(); + } + ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, @@ -839,6 +863,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + int if_index; ktime_t hwtstamp; /* Race occurred between timestamp enabling and packet @@ -887,18 +912,22 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { - if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) - hwtstamp = ptp_convert_timestamp(&shhwtstamps->hwtstamp, - sk->sk_bind_phc); + if_index = 0; + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) + hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; + if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + sk->sk_bind_phc); + if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) - put_ts_pktinfo(msg, skb); + put_ts_pktinfo(msg, skb, if_index); } } if (!empty) { -- cgit v1.2.3 From 5b87be9e4978fe507c7e14c0bdff147d10d39aec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 May 2022 20:57:37 -0700 Subject: net: add include/net/net_debug.h Remove from include/linux/netdevice.h helpers that send debug/info/warnings to syslog. We plan adding more helpers in following patches. v2: added two includes, and 'struct net_device' forward declaration to avoid compile errors (kernel bots) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 141 +------------------------------------------ include/net/net_debug.h | 151 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+), 140 deletions(-) create mode 100644 include/net/net_debug.h (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51fe143091cf..536321691c72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -50,6 +50,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -5071,81 +5072,9 @@ static inline const char *netdev_reg_state(const struct net_device *dev) return " (unknown)"; } -__printf(3, 4) __cold -void netdev_printk(const char *level, const struct net_device *dev, - const char *format, ...); -__printf(2, 3) __cold -void netdev_emerg(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_alert(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_crit(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_err(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_warn(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_notice(const struct net_device *dev, const char *format, ...); -__printf(2, 3) __cold -void netdev_info(const struct net_device *dev, const char *format, ...); - -#define netdev_level_once(level, dev, fmt, ...) \ -do { \ - static bool __section(".data.once") __print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ - } \ -} while (0) - -#define netdev_emerg_once(dev, fmt, ...) \ - netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) -#define netdev_alert_once(dev, fmt, ...) \ - netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) -#define netdev_crit_once(dev, fmt, ...) \ - netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) -#define netdev_err_once(dev, fmt, ...) \ - netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) -#define netdev_warn_once(dev, fmt, ...) \ - netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) -#define netdev_notice_once(dev, fmt, ...) \ - netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) -#define netdev_info_once(dev, fmt, ...) \ - netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) - #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netdev_dbg(__dev, format, args...) \ -do { \ - dynamic_netdev_dbg(__dev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netdev_dbg(__dev, format, args...) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args) -#else -#define netdev_dbg(__dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, __dev, format, ##args); \ -}) -#endif - -#if defined(VERBOSE_DEBUG) -#define netdev_vdbg netdev_dbg -#else - -#define netdev_vdbg(dev, format, args...) \ -({ \ - if (0) \ - netdev_printk(KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * netdev_WARN() acts like dev_printk(), but with the key difference * of using a WARN/WARN_ON to get the message out, including the @@ -5159,74 +5088,6 @@ do { \ WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev), \ netdev_reg_state(dev), ##args) -/* netif printk helpers, similar to netdev_printk */ - -#define netif_printk(priv, type, level, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_printk(level, (dev), fmt, ##args); \ -} while (0) - -#define netif_level(level, priv, type, dev, fmt, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - netdev_##level(dev, fmt, ##args); \ -} while (0) - -#define netif_emerg(priv, type, dev, fmt, args...) \ - netif_level(emerg, priv, type, dev, fmt, ##args) -#define netif_alert(priv, type, dev, fmt, args...) \ - netif_level(alert, priv, type, dev, fmt, ##args) -#define netif_crit(priv, type, dev, fmt, args...) \ - netif_level(crit, priv, type, dev, fmt, ##args) -#define netif_err(priv, type, dev, fmt, args...) \ - netif_level(err, priv, type, dev, fmt, ##args) -#define netif_warn(priv, type, dev, fmt, args...) \ - netif_level(warn, priv, type, dev, fmt, ##args) -#define netif_notice(priv, type, dev, fmt, args...) \ - netif_level(notice, priv, type, dev, fmt, ##args) -#define netif_info(priv, type, dev, fmt, args...) \ - netif_level(info, priv, type, dev, fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) -#define netif_dbg(priv, type, netdev, format, args...) \ -do { \ - if (netif_msg_##type(priv)) \ - dynamic_netdev_dbg(netdev, format, ##args); \ -} while (0) -#elif defined(DEBUG) -#define netif_dbg(priv, type, dev, format, args...) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) -#else -#define netif_dbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - -/* if @cond then downgrade to debug, else print at @level */ -#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ - do { \ - if (cond) \ - netif_dbg(priv, type, netdev, fmt, ##args); \ - else \ - netif_ ## level(priv, type, netdev, fmt, ##args); \ - } while (0) - -#if defined(VERBOSE_DEBUG) -#define netif_vdbg netif_dbg -#else -#define netif_vdbg(priv, type, dev, format, args...) \ -({ \ - if (0) \ - netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ - 0; \ -}) -#endif - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. diff --git a/include/net/net_debug.h b/include/net/net_debug.h new file mode 100644 index 000000000000..19cd2700c20e --- /dev/null +++ b/include/net/net_debug.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NET_DEBUG_H +#define _LINUX_NET_DEBUG_H + +#include +#include + +struct net_device; + +__printf(3, 4) __cold +void netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...); +__printf(2, 3) __cold +void netdev_emerg(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_alert(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_crit(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_err(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_warn(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_notice(const struct net_device *dev, const char *format, ...); +__printf(2, 3) __cold +void netdev_info(const struct net_device *dev, const char *format, ...); + +#define netdev_level_once(level, dev, fmt, ...) \ +do { \ + static bool __section(".data.once") __print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + netdev_printk(level, dev, fmt, ##__VA_ARGS__); \ + } \ +} while (0) + +#define netdev_emerg_once(dev, fmt, ...) \ + netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__) +#define netdev_alert_once(dev, fmt, ...) \ + netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__) +#define netdev_crit_once(dev, fmt, ...) \ + netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__) +#define netdev_err_once(dev, fmt, ...) \ + netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__) +#define netdev_warn_once(dev, fmt, ...) \ + netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__) +#define netdev_notice_once(dev, fmt, ...) \ + netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__) +#define netdev_info_once(dev, fmt, ...) \ + netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netdev_dbg(__dev, format, args...) \ +do { \ + dynamic_netdev_dbg(__dev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netdev_dbg(__dev, format, args...) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args) +#else +#define netdev_dbg(__dev, format, args...) \ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, __dev, format, ##args); \ +}) +#endif + +#if defined(VERBOSE_DEBUG) +#define netdev_vdbg netdev_dbg +#else + +#define netdev_vdbg(dev, format, args...) \ +({ \ + if (0) \ + netdev_printk(KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* netif printk helpers, similar to netdev_printk */ + +#define netif_printk(priv, type, level, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_printk(level, (dev), fmt, ##args); \ +} while (0) + +#define netif_level(level, priv, type, dev, fmt, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + netdev_##level(dev, fmt, ##args); \ +} while (0) + +#define netif_emerg(priv, type, dev, fmt, args...) \ + netif_level(emerg, priv, type, dev, fmt, ##args) +#define netif_alert(priv, type, dev, fmt, args...) \ + netif_level(alert, priv, type, dev, fmt, ##args) +#define netif_crit(priv, type, dev, fmt, args...) \ + netif_level(crit, priv, type, dev, fmt, ##args) +#define netif_err(priv, type, dev, fmt, args...) \ + netif_level(err, priv, type, dev, fmt, ##args) +#define netif_warn(priv, type, dev, fmt, args...) \ + netif_level(warn, priv, type, dev, fmt, ##args) +#define netif_notice(priv, type, dev, fmt, args...) \ + netif_level(notice, priv, type, dev, fmt, ##args) +#define netif_info(priv, type, dev, fmt, args...) \ + netif_level(info, priv, type, dev, fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#define netif_dbg(priv, type, netdev, format, args...) \ +do { \ + if (netif_msg_##type(priv)) \ + dynamic_netdev_dbg(netdev, format, ##args); \ +} while (0) +#elif defined(DEBUG) +#define netif_dbg(priv, type, dev, format, args...) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args) +#else +#define netif_dbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + +/* if @cond then downgrade to debug, else print at @level */ +#define netif_cond_dbg(priv, type, netdev, cond, level, fmt, args...) \ + do { \ + if (cond) \ + netif_dbg(priv, type, netdev, fmt, ##args); \ + else \ + netif_ ## level(priv, type, netdev, fmt, ##args); \ + } while (0) + +#if defined(VERBOSE_DEBUG) +#define netif_vdbg netif_dbg +#else +#define netif_vdbg(priv, type, dev, format, args...) \ +({ \ + if (0) \ + netif_printk(priv, type, KERN_DEBUG, dev, format, ##args); \ + 0; \ +}) +#endif + + +#endif /* _LINUX_NET_DEBUG_H */ -- cgit v1.2.3 From 7c4e983c4f3cf94fcd879730c6caa877e0768a4d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:33:57 -0700 Subject: net: allow gso_max_size to exceed 65536 The code for gso_max_size was added originally to allow for debugging and workaround of buggy devices that couldn't support TSO with blocks 64K in size. The original reason for limiting it to 64K was because that was the existing limits of IPv4 and non-jumbogram IPv6 length fields. With the addition of Big TCP we can remove this limit and allow the value to potentially go up to UINT_MAX and instead be limited by the tso_max_size value. So in order to support this we need to go through and clean up the remaining users of the gso_max_size value so that the values will cap at 64K for non-TCPv6 flows. In addition we can clean up the GSO_MAX_SIZE value so that 64K becomes GSO_LEGACY_MAX_SIZE and UINT_MAX will now be the upper limit for GSO_MAX_SIZE. v6: (edumazet) fixed a compile error if CONFIG_IPV6=n, in a new sk_trim_gso_size() helper. netif_set_tso_max_size() caps the requested TSO size with GSO_MAX_SIZE. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- drivers/net/ethernet/sfc/ef100_nic.c | 3 ++- drivers/net/ethernet/sfc/falcon/tx.c | 3 ++- drivers/net/ethernet/sfc/tx_common.c | 3 ++- drivers/net/ethernet/synopsys/dwc-xlgmac.h | 3 ++- drivers/net/hyperv/rndis_filter.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- include/linux/netdevice.h | 4 +++- net/bpf/test_run.c | 2 +- net/core/dev.c | 7 ++++--- net/core/rtnetlink.c | 2 +- net/core/sock.c | 14 ++++++++++++++ net/ipv4/tcp_bbr.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/sctp/output.c | 3 ++- 16 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 607a2c90513b..d9547552ceef 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -151,7 +151,8 @@ #define XGBE_TX_MAX_BUF_SIZE (0x3fff & ~(64 - 1)) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XGBE_TX_MAX_SPLIT ((GSO_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) +#define XGBE_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XGBE_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for an SKB: * - Maximum number of SKB frags diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fb11081001a0..838870bc6dbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GSO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; } static void diff --git a/drivers/net/ethernet/sfc/ef100_nic.c b/drivers/net/ethernet/sfc/ef100_nic.c index a69d756e09b9..b2536d2c218a 100644 --- a/drivers/net/ethernet/sfc/ef100_nic.c +++ b/drivers/net/ethernet/sfc/ef100_nic.c @@ -1008,7 +1008,8 @@ static int ef100_process_design_param(struct efx_nic *efx, } return 0; case ESE_EF100_DP_GZ_TSO_MAX_PAYLOAD_LEN: - nic_data->tso_max_payload_len = min_t(u64, reader->value, GSO_MAX_SIZE); + nic_data->tso_max_payload_len = min_t(u64, reader->value, + GSO_LEGACY_MAX_SIZE); netif_set_tso_max_size(efx->net_dev, nic_data->tso_max_payload_len); return 0; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index f7306e93a8b8..b9369483758c 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -98,7 +98,8 @@ unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EF4_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EF4_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EF4_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/sfc/tx_common.c b/drivers/net/ethernet/sfc/tx_common.c index 9bc8281b7f5b..658ea2d34070 100644 --- a/drivers/net/ethernet/sfc/tx_common.c +++ b/drivers/net/ethernet/sfc/tx_common.c @@ -416,7 +416,8 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx) /* Possibly more for PCIe page boundaries within input fragments */ if (PAGE_SIZE > EFX_PAGE_SIZE) max_descs += max_t(unsigned int, MAX_SKB_FRAGS, - DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE)); + DIV_ROUND_UP(GSO_LEGACY_MAX_SIZE, + EFX_PAGE_SIZE)); return max_descs; } diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac.h b/drivers/net/ethernet/synopsys/dwc-xlgmac.h index 98e3a271e017..a848e10f3ea4 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac.h +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac.h @@ -38,7 +38,8 @@ #define XLGMAC_RX_DESC_MAX_DIRTY (XLGMAC_RX_DESC_CNT >> 3) /* Descriptors required for maximum contiguous TSO/GSO packet */ -#define XLGMAC_TX_MAX_SPLIT ((GSO_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) +#define XLGMAC_TX_MAX_SPLIT \ + ((GSO_LEGACY_MAX_SIZE / XLGMAC_TX_MAX_BUF_SIZE) + 1) /* Maximum possible descriptors needed for a SKB */ #define XLGMAC_TX_MAX_DESC_NR (MAX_SKB_FRAGS + XLGMAC_TX_MAX_SPLIT + 2) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 866af2cc27a3..6da36cb8af80 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1349,7 +1349,7 @@ static int rndis_netdev_set_hwcaps(struct rndis_device *rndis_device, struct net_device_context *net_device_ctx = netdev_priv(net); struct ndis_offload hwcaps; struct ndis_offload_params offloads; - unsigned int gso_max_size = GSO_MAX_SIZE; + unsigned int gso_max_size = GSO_LEGACY_MAX_SIZE; int ret; /* Find HW offload capabilities */ diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 44ca6110213c..79b2827e4081 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -667,7 +667,7 @@ static void fcoe_netdev_features_change(struct fc_lport *lport, if (netdev->features & NETIF_F_FSO) { lport->seq_offload = 1; - lport->lso_max = netdev->gso_max_size; + lport->lso_max = min(netdev->gso_max_size, GSO_LEGACY_MAX_SIZE); FCOE_NETDEV_DBG(netdev, "Supports LSO for max len 0x%x\n", lport->lso_max); } else { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 536321691c72..ce780e352f43 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,7 +2272,9 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ -#define GSO_MAX_SIZE 65536 +#define GSO_LEGACY_MAX_SIZE 65536u +#define GSO_MAX_SIZE UINT_MAX + unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8d54fef9a568..9b5a1f630bb0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1001,7 +1001,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || - __skb->wire_len > GSO_MAX_SIZE) + __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } diff --git a/net/core/dev.c b/net/core/dev.c index a601da3b4a7c..830beb05161a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2998,11 +2998,12 @@ EXPORT_SYMBOL(netif_set_real_num_queues); * @size: max skb->len of a TSO frame * * Set the limit on the size of TSO super-frames the device can handle. - * Unless explicitly set the stack will assume the value of %GSO_MAX_SIZE. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. */ void netif_set_tso_max_size(struct net_device *dev, unsigned int size) { - dev->tso_max_size = size; + dev->tso_max_size = min(GSO_MAX_SIZE, size); if (size < READ_ONCE(dev->gso_max_size)) netif_set_gso_max_size(dev, size); } @@ -10595,7 +10596,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f35cc21298ac..f2b0f747d3d2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2817,7 +2817,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE || max_size > dev->tso_max_size) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } diff --git a/net/core/sock.c b/net/core/sock.c index 6b287eb5427b..24a46a1e4f28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2293,6 +2293,19 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -2312,6 +2325,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c7d30a3bbd81..075e744bfb48 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -310,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) */ bytes = min_t(unsigned long, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); return min(segs, 0x7FU); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b092228e4342..b4b2284ed4a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1553,7 +1553,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ - limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && diff --git a/net/sctp/output.c b/net/sctp/output.c index 72fe6669c50d..a63df055ac57 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -134,7 +134,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } - packet->max_size = sk_can_gso(sk) ? READ_ONCE(tp->dst->dev->gso_max_size) + packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), + GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } -- cgit v1.2.3 From 34b92e8d19da1e44070dc0ecc2747871469ac534 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 May 2022 11:33:58 -0700 Subject: net: limit GSO_MAX_SIZE to 524280 bytes Make sure we will not overflow shinfo->gso_segs Minimal TCP MSS size is 8 bytes, and shinfo->gso_segs is a 16bit field. TCP_MIN_GSO_SIZE is currently defined in include/net/tcp.h, it seems cleaner to not bring tcp details into include/linux/netdevice.h Signed-off-by: Eric Dumazet Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce780e352f43..fd38847d0dc7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2272,14 +2272,17 @@ struct net_device { const struct rtnl_link_ops *rtnl_link_ops; /* for setting kernel sock attribute on TCP connection setup */ +#define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u -#define GSO_MAX_SIZE UINT_MAX +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; -#define GSO_MAX_SEGS 65535 u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; -- cgit v1.2.3 From 0fe79f28bfaf73b66b7b1562d2468f94aa03bd12 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 13 May 2022 11:34:03 -0700 Subject: net: allow gro_max_size to exceed 65536 Allow the gro_max_size to exceed a value larger than 65536. There weren't really any external limitations that prevented this other than the fact that IPv4 only supports a 16 bit length field. Since we have the option of adding a hop-by-hop header for IPv6 we can allow IPv6 to exceed this value and for IPv4 and non-TCP flows we can cap things at 65536 via a constant rather than relying on gro_max_size. [edumazet] limit GRO_MAX_SIZE to (8 * 65535) to avoid overflows. Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- include/linux/netdevice.h | 6 +++++- include/net/ipv6.h | 2 +- net/core/dev.c | 2 +- net/core/gro.c | 8 ++++++++ net/core/rtnetlink.c | 8 -------- 6 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 838870bc6dbd..24de37b79f5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -2038,7 +2038,7 @@ mlx5e_hw_gro_skb_has_enough_space(struct sk_buff *skb, u16 data_bcnt) { int nr_frags = skb_shinfo(skb)->nr_frags; - return PAGE_SIZE * nr_frags + data_bcnt <= GRO_MAX_SIZE; + return PAGE_SIZE * nr_frags + data_bcnt <= GRO_LEGACY_MAX_SIZE; } static void diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd38847d0dc7..d57ce248004c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2161,7 +2161,11 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; int napi_defer_hard_irqs; -#define GRO_MAX_SIZE 65536 +#define GRO_LEGACY_MAX_SIZE 65536u +/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), + * and shinfo->gso_segs is a 16bit field. + */ +#define GRO_MAX_SIZE (8 * 65535u) unsigned int gro_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index b6df0314aa02..5b38bf1a586b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -477,7 +477,7 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) const struct hop_jumbo_hdr *jhdr; const struct ipv6hdr *nhdr; - if (likely(skb->len <= GRO_MAX_SIZE)) + if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) return 0; if (skb->protocol != htons(ETH_P_IPV6)) diff --git a/net/core/dev.c b/net/core/dev.c index 830beb05161a..d93456c75b55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10598,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->tso_max_size = TSO_LEGACY_MAX_SIZE; dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; diff --git a/net/core/gro.c b/net/core/gro.c index 78110edf5d4b..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -167,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f2b0f747d3d2..ac45328607f7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2360,14 +2360,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } - if (tb[IFLA_GRO_MAX_SIZE]) { - u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); - - if (gro_max_size > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } - } return 0; } -- cgit v1.2.3 From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 May 2022 21:24:53 -0700 Subject: net: fix possible race in skb_attempt_defer_free() A cpu can observe sd->defer_count reaching 128, and call smp_call_function_single_async() Problem is that the remote CPU can clear sd->defer_count before the IPI is run/acknowledged. Other cpus can queue more packets and also decide to call smp_call_function_single_async() while the pending IPI was not yet delivered. This is a common issue with smp_call_function_single_async(). Callers must ensure correct synchronization and serialization. I triggered this issue while experimenting smaller threshold. Performing the call to smp_call_function_single_async() under sd->defer_lock protection did not solve the problem. Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async() to insert locked csd") replaced an informative WARN_ON_ONCE() with a return of -EBUSY, which is often ignored. Test of CSD_FLAG_LOCK presence is racy anyway. Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 7 +++++-- net/core/skbuff.c | 5 ++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d57ce248004c..cbaf312e365b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3136,6 +3136,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; int defer_count; + int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; }; diff --git a/net/core/dev.c b/net/core/dev.c index d93456c75b55..a5e663e1a75a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4582,9 +4582,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ -static void trigger_rx_softirq(void *data __always_unused) +static void trigger_rx_softirq(void *data) { + struct softnet_data *sd = data; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); } /* @@ -11382,7 +11385,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif - INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2fea964f09d8..b40c8cdf4785 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6516,8 +6516,7 @@ void skb_attempt_defer_free(struct sk_buff *skb) sd->defer_count++; /* kick every time queue length reaches 128. - * This should avoid blocking in smp_call_function_single_async(). - * This condition should hardly be bit under normal conditions, + * This condition should hardly be hit under normal conditions, * unless cpu suddenly stopped to receive NIC interrupts. */ kick = sd->defer_count == 128; @@ -6527,6 +6526,6 @@ void skb_attempt_defer_free(struct sk_buff *skb) /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick)) + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } -- cgit v1.2.3 From cf2df74e202d81b09f09d84c2d8903e0e87e9274 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 9 May 2022 14:26:15 +0200 Subject: net: fix dev_fill_forward_path with pppoe + bridge When calling dev_fill_forward_path on a pppoe device, the provided destination address is invalid. In order for the bridge fdb lookup to succeed, the pppoe code needs to update ctx->daddr to the correct value. Fix this by storing the address inside struct net_device_path_ctx Fixes: f6efc675c9dd ("net: ppp: resolve forwarding path for bridge pppoe devices") Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- drivers/net/ppp/pppoe.c | 1 + include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 3619520340b7..e172743948ed 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -988,6 +988,7 @@ static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); + memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b1fbe21650bb..f736c020cde2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -900,7 +900,7 @@ struct net_device_path_stack { struct net_device_path_ctx { const struct net_device *dev; - const u8 *daddr; + u8 daddr[ETH_ALEN]; int num_vlans; struct { diff --git a/net/core/dev.c b/net/core/dev.c index 1461c2d9dec8..2771fd22dc6a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -681,11 +681,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, - .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; -- cgit v1.2.3 From c304eddcecfe2513ff98ce3ae97d1c196d82ba08 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 19 May 2022 13:20:54 -0700 Subject: net: wrap the wireless pointers in struct net_device in an ifdef Most protocol-specific pointers in struct net_device are under a respective ifdef. Wireless is the notable exception. Since there's a sizable number of custom-built kernels for datacenter workloads which don't build wireless it seems reasonable to ifdefy those pointers as well. While at it move IPv4 and IPv6 pointers up, those are special for obvious reasons. Acked-by: Johannes Berg Acked-by: Stefan Schmidt # ieee802154 Acked-by: Sven Eckelmann Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- include/net/cfg80211.h | 2 ++ include/net/cfg802154.h | 2 ++ net/batman-adv/hard-interface.c | 2 ++ net/core/net-sysfs.c | 21 +++++++++++++-------- 5 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1ce91cb8074a..f615a66c89e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2119,6 +2119,8 @@ struct net_device { /* Protocol-specific pointers */ + struct in_device __rcu *ip_ptr; + struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2131,16 +2133,18 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif - struct in_device __rcu *ip_ptr; #if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; #endif - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif +#if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; +#endif +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; +#endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cc8a9880b9d6..6d02e12e4702 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -8006,7 +8006,9 @@ int cfg80211_register_netdevice(struct net_device *dev); */ static inline void cfg80211_unregister_netdevice(struct net_device *dev) { +#if IS_ENABLED(CONFIG_CFG80211) cfg80211_unregister_wdev(dev->ieee80211_ptr); +#endif } /** diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 85f9e8417688..d8d8719315fd 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -373,6 +373,7 @@ struct wpan_dev { #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) +#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) static inline int wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, @@ -383,6 +384,7 @@ wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len); } +#endif struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 83fb51b6e299..b8f8da7ee3de 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -307,9 +307,11 @@ static bool batadv_is_cfg80211_netdev(struct net_device *net_device) if (!net_device) return false; +#if IS_ENABLED(CONFIG_CFG80211) /* cfg80211 drivers have to set ieee80211_ptr */ if (net_device->ieee80211_ptr) return true; +#endif return false; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4980c3a50475..e319e242dddf 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -746,7 +746,6 @@ static const struct attribute_group netstat_group = { .attrs = netstat_attrs, }; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; @@ -755,7 +754,19 @@ static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; + +static bool wireless_group_needed(struct net_device *ndev) +{ +#if IS_ENABLED(CONFIG_CFG80211) + if (ndev->ieee80211_ptr) + return true; #endif +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + if (ndev->wireless_handlers) + return true; +#endif + return false; +} #else /* CONFIG_SYSFS */ #define net_class_groups NULL @@ -1996,14 +2007,8 @@ int netdev_register_kobject(struct net_device *ndev) *groups++ = &netstat_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) - if (ndev->ieee80211_ptr) - *groups++ = &wireless_group; -#if IS_ENABLED(CONFIG_WIRELESS_EXT) - else if (ndev->wireless_handlers) + if (wireless_group_needed(ndev)) *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); -- cgit v1.2.3