From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; -- cgit v1.2.3 From a3f5f4c2f9b6bc2aa6f5a3e8e23b7519e4f2e3e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:20 +0000 Subject: ipv4: remove fib_info_devhash[] Upcoming per-netns RTNL conversion needs to get rid of shared hash tables. fib_info_devhash[] is one of them. It is unclear why we used a hash table, because a single hlist_head per net device was cheaper and scalable. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 ++ net/ipv4/fib_semantics.c | 35 ++++++++++------------ 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 49f03cb78c6e..556711c4d3cf 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -83,6 +83,7 @@ unsigned_int allmulti bool uc_promisc unsigned_char nested_level struct_in_device* ip_ptr read_mostly read_mostly __in_dev_get +struct hlist_head fib_nh_head struct_inet6_dev* ip6_ptr read_mostly read_mostly __in6_dev_get struct_vlan_info* vlan_info struct_dsa_port* dsa_ptr diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49a7e7db0883..3baf8e539b6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2211,6 +2211,9 @@ struct net_device { /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; + /** @fib_nh_head: nexthops associated with this netdev */ + struct hlist_head fib_nh_head; + #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ece779bfb8f6..d2cee5c314f5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -56,10 +56,6 @@ static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; -#define DEVINDEX_HASHBITS 8 -#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) -static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -319,12 +315,9 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) return 0; } -static struct hlist_head * -fib_info_devhash_bucket(const struct net_device *dev) +static struct hlist_head *fib_nh_head(struct net_device *dev) { - u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; - - return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)]; + return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -435,11 +428,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) struct hlist_head *head; struct fib_nh *nh; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev && - nh->fib_nh_gw4 == gw && + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } @@ -1595,7 +1588,7 @@ link_it: if (!nexthop_nh->fib_nh_dev) continue; - head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); + head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } @@ -1948,12 +1941,12 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev) - fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1967,7 +1960,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; @@ -1981,7 +1974,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) int dead; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi; dead = 0; @@ -2131,7 +2125,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) } prev_fi = NULL; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { @@ -2139,7 +2133,8 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) int alive; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi; -- cgit v1.2.3 From 836265d31631e28000fc8917ce697fc687a58724 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:35:25 +0200 Subject: wifi: remove iw_public_data from struct net_device Given the previous patches, we no longer need the struct iw_public_data etc., it's only used by the old Intel drivers (and ps3_gelic creates it but then doesn't use it). Remove all of that, including the pointer in struct net_device. Link: https://patch.msgid.link/20241007213525.8b2d52b60531.I6a27aaf30bded9a0977f07f47fba2bd31a3b3330@changeid Signed-off-by: Johannes Berg --- Documentation/networking/net_cachelines/net_device.rst | 1 - drivers/net/ethernet/toshiba/ps3_gelic_wireless.c | 1 - drivers/net/ethernet/toshiba/ps3_gelic_wireless.h | 1 - drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 -- drivers/net/wireless/intel/ipw2x00/ipw2100.h | 2 -- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 1 - drivers/net/wireless/intel/ipw2x00/ipw2200.h | 2 -- drivers/net/wireless/intel/ipw2x00/libipw_spy.c | 7 ++++--- include/linux/netdevice.h | 2 -- include/net/iw_handler.h | 18 ------------------ 10 files changed, 4 insertions(+), 33 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a..b7d1d4c7a1ab 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -51,7 +51,6 @@ struct_net_device_core_stats* core_stats atomic_t carrier_up_count atomic_t carrier_down_count struct_iw_handler_def* wireless_handlers -struct_iw_public_data* wireless_data struct_ethtool_ops* ethtool_ops struct_l3mdev_ops* l3mdev_ops struct_ndisc_ops* ndisc_ops diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c index 44488c153ea2..4fbe4b7cd12a 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c @@ -2566,7 +2566,6 @@ static void gelic_wl_setup_netdev_ops(struct net_device *netdev) netdev->ethtool_ops = &gelic_wl_ethtool_ops; netdev->netdev_ops = &gelic_wl_netdevice_ops; - netdev->wireless_data = &wl->wireless_data; netdev->wireless_handlers = &gelic_wl_wext_handler_def; } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h index 1f203d1ae8db..dbabf538e10a 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h @@ -276,7 +276,6 @@ struct gelic_wl_info { u8 active_bssid[ETH_ALEN]; /* associated bssid */ unsigned int essid_len; - struct iw_public_data wireless_data; struct iw_statistics iwstat; }; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index a89e06c1b8ee..15bf35f2de2a 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -6022,8 +6022,6 @@ static struct net_device *ipw2100_alloc_device(struct pci_dev *pci_dev, dev->netdev_ops = &ipw2100_netdev_ops; dev->ethtool_ops = &ipw2100_ethtool_ops; dev->wireless_handlers = &ipw2100_wx_handler_def; - priv->wireless_data.libipw = priv->ieee; - dev->wireless_data = &priv->wireless_data; dev->watchdog_timeo = 3 * HZ; dev->irq = 0; dev->min_mtu = 68; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.h b/drivers/net/wireless/intel/ipw2x00/ipw2100.h index b34085ade3aa..6f81f509b9cb 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.h @@ -554,8 +554,6 @@ struct ipw2100_priv { struct net_device *net_dev; struct iw_statistics wstats; - struct iw_public_data wireless_data; - struct tasklet_struct irq_tasklet; struct delayed_work reset_work; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index c0e9d2109e34..be1d971b3d32 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -11645,7 +11645,6 @@ static int ipw_pci_probe(struct pci_dev *pdev, net_dev->netdev_ops = &ipw_netdev_ops; priv->ieee->spy_enabled = true; - net_dev->wireless_data = &priv->wireless_data; net_dev->wireless_handlers = &ipw_wx_handler_def; net_dev->ethtool_ops = &ipw_ethtool_ops; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h index 46f119123b49..d3db54e6d37c 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h @@ -1274,8 +1274,6 @@ struct ipw_priv { struct iw_statistics wstats; - struct iw_public_data wireless_data; - int user_requested_scan; u8 direct_scan_ssid[IW_ESSID_MAX_SIZE]; u8 direct_scan_ssid_len; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c index 979aeb10aeeb..ba876e92f7f6 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c @@ -18,9 +18,10 @@ static struct iw_spy_data *get_spydata(struct net_device *dev) { - if (dev->wireless_data && dev->wireless_data->libipw && - dev->wireless_data->libipw->spy_enabled) - return &dev->wireless_data->libipw->spy_data; + struct libipw_device *ieee = netdev_priv(dev); + + if (ieee->spy_enabled) + return &ieee->spy_data; return NULL; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87b5e488325..f0abed1e6e45 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1773,7 +1773,6 @@ enum netdev_reg_state { * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see for details. - * @wireless_data: Instance data managed by the core of wireless extensions * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions @@ -2150,7 +2149,6 @@ struct net_device { #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; - struct iw_public_data *wireless_data; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index a7b502958d27..fc44fcca1d5c 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -404,24 +404,6 @@ struct iw_spy_data { u_char spy_thr_under[IW_MAX_SPY]; }; -/* --------------------- DEVICE WIRELESS DATA --------------------- */ -/* - * This is all the wireless data specific to a device instance that - * is managed by the core of Wireless Extensions or the 802.11 layer. - * We only keep pointer to those structures, so that a driver is free - * to share them between instances. - * This structure should be initialised before registering the device. - * Access to this data follow the same rules as any other struct net_device - * data (i.e. valid as long as struct net_device exist, same locking rules). - */ -/* Forward declaration */ -struct libipw_device; -/* The struct */ -struct iw_public_data { - /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ - struct libipw_device * libipw; -}; - /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). -- cgit v1.2.3 From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:49 +0200 Subject: net-shapers: implement NL get operation Introduce the basic infrastructure to implement the net-shaper core functionality. Each network devices carries a net-shaper cache, the NL get() operation fetches the data from such cache. The cache is initially empty, will be fill by the set()/group() operation implemented later and is destroyed at device cleanup time. The net_shaper_fill_handle(), net_shaper_ctx_init(), and net_shaper_generic_pre() implementations handle generic index type attributes, despite the current caller always pass a constant value to avoid more noise in later patches using them with different attributes. Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/kapi.rst | 3 + include/linux/netdevice.h | 21 +++ include/net/net_shaper.h | 120 ++++++++++++++ net/core/dev.c | 6 + net/core/dev.h | 6 + net/shaper/shaper.c | 335 +++++++++++++++++++++++++++++++++++++- 6 files changed, 484 insertions(+), 7 deletions(-) create mode 100644 include/net/net_shaper.h (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst index ea55f462cefa..98682b9a13ee 100644 --- a/Documentation/networking/kapi.rst +++ b/Documentation/networking/kapi.rst @@ -104,6 +104,9 @@ Driver Support .. kernel-doc:: include/linux/netdevice.h :internal: +.. kernel-doc:: include/net/net_shaper.h + :internal: + PHY Support ----------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3baf8e539b6f..e6b93d01e631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1603,6 +1603,14 @@ struct net_device_ops { int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_ops: Device shaping offload operations + * see include/net/net_shapers.h + */ + const struct net_shaper_ops *net_shaper_ops; +#endif }; /** @@ -2406,6 +2414,19 @@ struct net_device { u64 max_pacing_offload_horizon; + /** + * @lock: protects @net_shaper_hierarchy, feel free to use for other + * netdev-scope protection. Ordering: take after rtnl_lock. + */ + struct mutex lock; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_hierarchy: data tracking the current shaper status + * see include/net/net_shapers.h + */ + struct net_shaper_hierarchy *net_shaper_hierarchy; +#endif u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include + +#include + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed field are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specify if the rate limits refers to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peek rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations applies to either net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * does not constraint the rate in any way. + * The network core keeps track of the applied user-configuration in + * the net_device or devlink structure. + * The operations are serialized via a per device lock. + * + * Device not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: create the specified shapers scheduling group + * + * Nest the @leaves shapers identified under the * @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves arrays size is specified by @leaves_count. + * Create either the @leaves and the @node shaper; or if they already + * exists, links them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. + */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index ea5fbcd133ae..6e727c49a6f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11147,6 +11147,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, hash_init(dev->qdisc_hash); #endif + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11217,6 +11219,8 @@ void free_netdev(struct net_device *dev) return; } + mutex_destroy(&dev->lock); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11426,6 +11430,8 @@ void unregister_netdevice_many_notify(struct list_head *head, mutex_destroy(&dev->ethtool->rss_lock); + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); diff --git a/net/core/dev.h b/net/core/dev.h index 5654325c5b71..13c558874af3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -35,6 +35,12 @@ void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +#endif + /* sysctls not referred to from outside net/core/ */ extern int netdev_unregister_timeout_secs; extern int weight_p; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index a1b20888f502..22daf7dde999 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1,30 +1,333 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include "shaper_nl_gen.h" +#include "../core/dev.h" + +#define NET_SHAPER_SCOPE_SHIFT 26 +#define NET_SHAPER_ID_MASK GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0) +#define NET_SHAPER_SCOPE_MASK GENMASK(31, NET_SHAPER_SCOPE_SHIFT) + +#define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK + +struct net_shaper_hierarchy { + struct xarray shapers; +}; + +struct net_shaper_nl_ctx { + struct net_shaper_binding binding; + netdevice_tracker dev_tracker; + unsigned long start_index; +}; + +static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) +{ + return &((struct net_shaper_nl_ctx *)ctx)->binding; +} + +static struct net_shaper_hierarchy * +net_shaper_hierarchy(struct net_shaper_binding *binding) +{ + /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + return READ_ONCE(binding->netdev->net_shaper_hierarchy); + + /* No other type supported yet. */ + return NULL; +} + +static int net_shaper_fill_binding(struct sk_buff *msg, + const struct net_shaper_binding *binding, + u32 type) +{ + /* Should never happen, as currently only NETDEV is supported. */ + if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV)) + return -EINVAL; + + if (nla_put_u32(msg, type, binding->netdev->ifindex)) + return -EMSGSIZE; + + return 0; +} + +static int net_shaper_fill_handle(struct sk_buff *msg, + const struct net_shaper_handle *handle, + u32 type) +{ + struct nlattr *handle_attr; + + if (handle->scope == NET_SHAPER_SCOPE_UNSPEC) + return 0; + + handle_attr = nla_nest_start(msg, type); + if (!handle_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) || + (handle->scope >= NET_SHAPER_SCOPE_QUEUE && + nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id))) + goto handle_nest_cancel; + + nla_nest_end(msg, handle_attr); + return 0; + +handle_nest_cancel: + nla_nest_cancel(msg, handle_attr); + return -EMSGSIZE; +} + +static int +net_shaper_fill_one(struct sk_buff *msg, + const struct net_shaper_binding *binding, + const struct net_shaper *shaper, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(msg, info); + if (!hdr) + return -EMSGSIZE; + + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || + net_shaper_fill_handle(msg, &shaper->parent, + NET_SHAPER_A_PARENT) || + net_shaper_fill_handle(msg, &shaper->handle, + NET_SHAPER_A_HANDLE) || + ((shaper->bw_min || shaper->bw_max || shaper->burst) && + nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || + (shaper->bw_min && + nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || + (shaper->bw_max && + nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || + (shaper->burst && + nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || + (shaper->priority && + nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || + (shaper->weight && + nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/* Initialize the context fetching the relevant device and + * acquiring a reference to it. + */ +static int net_shaper_ctx_setup(const struct genl_info *info, int type, + struct net_shaper_nl_ctx *ctx) +{ + struct net *ns = genl_info_net(info); + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, type)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[type]); + dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL); + if (!dev) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + return -ENOENT; + } + + if (!dev->netdev_ops->net_shaper_ops) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + netdev_put(dev, &ctx->dev_tracker); + return -EOPNOTSUPP; + } + + ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; + ctx->binding.netdev = dev; + return 0; +} + +static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) +{ + if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) + netdev_put(ctx->binding.netdev, &ctx->dev_tracker); +} + +static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) +{ + return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | + FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); +} + +static struct net_shaper * +net_shaper_lookup(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + u32 index = net_shaper_handle_to_index(handle); + + return hierarchy ? xa_load(&hierarchy->shapers, index) : NULL; +} + +static int net_shaper_parse_handle(const struct nlattr *attr, + const struct genl_info *info, + struct net_shaper_handle *handle) +{ + struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1]; + struct nlattr *id_attr; + u32 id = 0; + int ret; + + ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr, + net_shaper_handle_nl_policy, info->extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, + NET_SHAPER_A_HANDLE_SCOPE)) + return -EINVAL; + + handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]); + + /* The default id for NODE scope shapers is an invalid one + * to help the 'group' operation discriminate between new + * NODE shaper creation (ID_UNSPEC) and reuse of existing + * shaper (any other value). + */ + id_attr = tb[NET_SHAPER_A_HANDLE_ID]; + if (id_attr) + id = nla_get_u32(id_attr); + else if (handle->scope == NET_SHAPER_SCOPE_NODE) + id = NET_SHAPER_ID_UNSPEC; + + handle->id = id; + return 0; +} + +static int net_shaper_generic_pre(struct genl_info *info, int type) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); + + return net_shaper_ctx_setup(info, type, ctx); +} + int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX); +} + +static void net_shaper_generic_post(struct genl_info *info) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx); } void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + net_shaper_generic_post(info); +} + +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + + return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx); +} + +int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx); + return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct net_shaper_binding *binding; + struct net_shaper_handle handle; + struct net_shaper *shaper; + struct sk_buff *msg; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, + &handle); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rcu_read_lock(); + shaper = net_shaper_lookup(binding, &handle); + if (!shaper) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NET_SHAPER_A_HANDLE]); + rcu_read_unlock(); + ret = -ENOENT; + goto free_msg; + } + + ret = net_shaper_fill_one(msg, binding, shaper, info); + rcu_read_unlock(); + if (ret) + goto free_msg; + + ret = genlmsg_reply(msg, info); + if (ret) + goto free_msg; + + return 0; + +free_msg: + nlmsg_free(msg); + return ret; } int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + struct net_shaper *shaper; + int ret = 0; + + /* Don't error out dumps performed before any set operation. */ + binding = net_shaper_binding_from_ctx(ctx); + hierarchy = net_shaper_hierarchy(binding); + if (!hierarchy) + return 0; + + rcu_read_lock(); + for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, + U32_MAX, XA_PRESENT)); ctx->start_index++) { + ret = net_shaper_fill_one(skb, binding, shaper, info); + if (ret) + break; + } + rcu_read_unlock(); + + return ret; } int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) @@ -37,14 +340,32 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } -int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +static void net_shaper_flush(struct net_shaper_binding *binding) { - return -EOPNOTSUPP; + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur; + unsigned long index; + + if (!hierarchy) + return; + + xa_lock(&hierarchy->shapers); + xa_for_each(&hierarchy->shapers, index, cur) { + __xa_erase(&hierarchy->shapers, index); + kfree(cur); + } + xa_unlock(&hierarchy->shapers); + kfree(hierarchy); } -int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +void net_shaper_flush_netdev(struct net_device *dev) { - return -EOPNOTSUPP; + struct net_shaper_binding binding = { + .type = NET_SHAPER_BINDING_TYPE_NETDEV, + .netdev = dev, + }; + + net_shaper_flush(&binding); } static int __init shaper_init(void) -- cgit v1.2.3 From f15e3b3ddb9fab1c1731b6154e2cd6573fb54c4d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:56 +0000 Subject: net: napi: Make napi_defer_hard_irqs per-NAPI Add defer_hard_irqs to napi_struct in preparation for per-NAPI settings. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device defer_hard_irq field. Reads from sysfs show the net_device field. The ability to set defer_hard_irqs on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 +- net/core/dev.c | 10 +++--- net/core/dev.h | 36 ++++++++++++++++++++++ net/core/net-sysfs.c | 2 +- 5 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 1b018ac35e9a..5a7388b2ab6f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,4 +186,5 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6b93d01e631..2e7bc23660ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2085,7 +2086,6 @@ struct net_device { unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned long gro_flush_timeout; - u32 napi_defer_hard_irqs; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + u32 napi_defer_hard_irqs; /** * @lock: protects @net_shaper_hierarchy, feel free to use for other diff --git a/net/core/dev.c b/net/core/dev.c index b590eefce3b4..fbaa9eabf77f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6233,7 +6233,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) timeout = READ_ONCE(n->dev->gro_flush_timeout); - n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; @@ -6371,7 +6371,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { - napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); @@ -6653,6 +6653,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); @@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100); } /* diff --git a/net/core/dev.h b/net/core/dev.h index d3ea92949ff3..0716b1048261 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -148,6 +148,42 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defar_hard_irqs field. + */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + struct napi_struct *napi; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 05cf5347f25e..25125f356a15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val if (val > S32_MAX) return -ERANGE; - WRITE_ONCE(dev->napi_defer_hard_irqs, val); + netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } -- cgit v1.2.3 From acb8d4ed5661d05f794ef2ce34fd11e699e9ca32 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:58 +0000 Subject: net: napi: Make gro_flush_timeout per-NAPI Allow per-NAPI gro_flush_timeout setting. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device gro_flush_timeout field. Reads from sysfs will read from the net_device field. The ability to set gro_flush_timeout on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-4-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 +- net/core/dev.c | 12 +++---- net/core/dev.h | 40 ++++++++++++++++++++++ net/core/net-sysfs.c | 2 +- 5 files changed, 50 insertions(+), 8 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 5a7388b2ab6f..67910ea49160 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,5 +186,6 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7bc23660ec..93241d4de437 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + unsigned long gro_flush_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; @@ -2085,7 +2086,6 @@ struct net_device { int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; - unsigned long gro_flush_timeout; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; /** diff --git a/net/core/dev.c b/net/core/dev.c index fbaa9eabf77f..e21ace3551d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6232,12 +6232,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } @@ -6372,7 +6372,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = READ_ONCE(napi->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; @@ -6654,6 +6654,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - dev->gro_flush_timeout = 20000; + netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } @@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* diff --git a/net/core/dev.h b/net/core/dev.h index 0716b1048261..7d0aab7e3ef1 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -184,6 +184,46 @@ static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, napi_set_defer_hard_irqs(napi, defer); } +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. + */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + struct napi_struct *napi; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 25125f356a15..2d9afc6e2161 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->gro_flush_timeout, val); + netdev_set_gro_flush_timeout(dev, val); return 0; } -- cgit v1.2.3 From 86e25f40aa1e9e54e081e55016f65b5c92523989 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:00 +0000 Subject: net: napi: Add napi_config Add a persistent NAPI config area for NAPI configuration to the core. Drivers opt-in to setting the persistent config for a NAPI by passing an index when calling netif_napi_add_config. napi_config is allocated in alloc_netdev_mqs, freed in free_netdev (after the NAPIs are deleted). Drivers which call netif_napi_add_config will have persistent per-NAPI settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings. Per-NAPI settings are saved in napi_disable and restored in napi_enable. Co-developed-by: Martin Karsten Signed-off-by: Martin Karsten Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-6-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 36 +++++++++- net/core/dev.c | 80 ++++++++++++++++++++-- net/core/dev.h | 12 ++++ 4 files changed, 119 insertions(+), 10 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 67910ea49160..db6192b2bb50 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,6 +186,7 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +struct_napi_config* napi_config unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93241d4de437..8feaca12655e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -342,6 +342,15 @@ struct gro_list { */ #define GRO_HASH_BUCKETS 8 +/* + * Structure for per-NAPI config + */ +struct napi_config { + u64 gro_flush_timeout; + u32 defer_hard_irqs; + unsigned int napi_id; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -379,6 +388,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; + int index; + struct napi_config *config; }; enum { @@ -1868,9 +1879,6 @@ enum netdev_reg_state { * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer - * @gro_flush_timeout: timeout for GRO layer in NAPI - * @napi_defer_hard_irqs: If not zero, provides a counter that would - * allow to avoid NIC hard IRQ, on busy queues. * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one @@ -2020,6 +2028,11 @@ enum netdev_reg_state { * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * @napi_config: An array of napi_config structures containing per-NAPI + * settings. + * @gro_flush_timeout: timeout for GRO layer in NAPI + * @napi_defer_hard_irqs: If not zero, provides a counter that would + * allow to avoid NIC hard IRQ, on busy queues. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2413,6 +2426,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + struct napi_config *napi_config; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; @@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } +/** + * netif_napi_add_config - initialize a NAPI context with persistent config + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @index: the NAPI index + */ +static inline void +netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) +{ + napi->index = index; + napi->config = &dev->napi_config[index]; + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); +} + /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device diff --git a/net/core/dev.c b/net/core/dev.c index e21ace3551d5..c682173a7642 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6505,6 +6505,23 @@ EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + napi->napi_id = napi_id; + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); +} + +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + spin_lock(&napi_hash_lock); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock(&napi_hash_lock); +} + static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) @@ -6517,10 +6534,8 @@ static void napi_hash_add(struct napi_struct *napi) if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); - napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock(&napi_hash_lock); } @@ -6643,6 +6658,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void napi_restore_config(struct napi_struct *n) +{ + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. It will be saved to the config + * in napi_disable. + */ + if (n->config->napi_id) + napi_hash_add_with_id(n, n->config->napi_id); + else + napi_hash_add(n); +} + +static void napi_save_config(struct napi_struct *n) +{ + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->napi_id = n->napi_id; + napi_hash_del(n); +} + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6653,8 +6690,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); - napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -6672,7 +6707,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); - napi_hash_add(napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that @@ -6704,6 +6745,11 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6719,6 +6765,11 @@ void napi_enable(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); @@ -6748,7 +6799,11 @@ void __netif_napi_del(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; - napi_hash_del(napi); + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + list_del_rcu(&napi->dev_list); napi_free_frags(napi); @@ -11085,6 +11140,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -11098,6 +11155,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } + maxqs = max(txqs, rxqs); + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) @@ -11174,6 +11233,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; @@ -11237,6 +11301,8 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + kvfree(dev->napi_config); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); diff --git a/net/core/dev.h b/net/core/dev.h index 7d0aab7e3ef1..7881bced70a9 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -177,11 +177,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, u32 defer) { + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); struct napi_struct *napi; + int i; WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); list_for_each_entry(napi, &netdev->napi_list, dev_list) napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; } /** @@ -217,11 +223,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n, static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, unsigned long timeout) { + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); struct napi_struct *napi; + int i; WRITE_ONCE(netdev->gro_flush_timeout, timeout); list_for_each_entry(napi, &netdev->napi_list, dev_list) napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; } int rps_cpumask_housekeeping(struct cpumask *mask); -- cgit v1.2.3 From e44ef3f66c5472c2cbc6957c684d7279c26b0db1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2024 05:21:08 +0000 Subject: netpoll: remove ndo_netpoll_setup() second argument npinfo is not used in any of the ndo_netpoll_setup() methods. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241018052108.2610827-1-edumazet@google.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/team/team_core.c | 3 +-- include/linux/netdevice.h | 3 +-- net/8021q/vlan_dev.c | 2 +- net/bridge/br_device.c | 2 +- net/core/netpoll.c | 2 +- net/dsa/user.c | 3 +-- 8 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b1bffd8e9a95..3928287f5865 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1476,7 +1476,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev) slave_disable_netpoll(slave); } -static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int bond_netpoll_setup(struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index cf18e66de142..edbd5afcec41 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1117,7 +1117,7 @@ static void macvlan_dev_poll_controller(struct net_device *dev) return; } -static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 18191d5a8bd4..a1b27b69f010 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -1946,8 +1946,7 @@ static void team_netpoll_cleanup(struct net_device *dev) mutex_unlock(&team->lock); } -static int team_netpoll_setup(struct net_device *dev, - struct netpoll_info *npifo) +static int team_netpoll_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8feaca12655e..86a0b7eb9461 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1425,8 +1425,7 @@ struct net_device_ops { __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); - int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info *info); + int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 458040e8a0e0..91d134961357 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -725,7 +725,7 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 26b79feb385d..0ab4613aa07a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -328,7 +328,7 @@ int br_netpoll_enable(struct net_bridge_port *p) return __br_netpoll_enable(p); } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index aa49b92e9194..94b7f07a952f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -641,7 +641,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } diff --git a/net/dsa/user.c b/net/dsa/user.c index 64f660d2334b..91a1fa5f8ab0 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1308,8 +1308,7 @@ static int dsa_user_set_pauseparam(struct net_device *dev, } #ifdef CONFIG_NET_POLL_CONTROLLER -static int dsa_user_netpoll_setup(struct net_device *dev, - struct netpoll_info *ni) +static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); -- cgit v1.2.3 From 7cfc1b1fa8673fe386304194d4cc2c8fe555bbf9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2024 05:23:10 +0000 Subject: net: netdev_tx_sent_queue() small optimization Change smp_mb() imediately following a set_bit() with smp_mb__after_atomic(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241018052310.2612084-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 86a0b7eb9461..bbd30f3c5d29 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3517,7 +3517,7 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, * because in netdev_tx_completed_queue we update the dql_completed * before checking the XOFF flag. */ - smp_mb(); + smp_mb__after_atomic(); /* check again in case another CPU has just made room avail */ if (unlikely(dql_avail(&dev_queue->dql) >= 0)) -- cgit v1.2.3 From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:43 +0000 Subject: neighbour: Create netdev->neighbour association Create a mapping between a netdev and its neighoburs, allowing for much cheaper flushes. Signed-off-by: Gilad Naaman Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 7 ++ include/net/neighbour.h | 9 +- include/net/neighbour_tables.h | 12 +++ net/core/neighbour.c | 96 +++++++++++++--------- 5 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 include/net/neighbour_tables.h (limited to 'include/linux/netdevice.h') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index ade50d4e67cf..15e31ece675f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -188,4 +188,5 @@ u64 max_pacing_offload_horizon struct_napi_config* napi_config unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs +struct hlist_head neighbours[2] =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c552b648b27..df4483598628 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2032,6 +2033,9 @@ enum netdev_reg_state { * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * + * @neighbours: List heads pointing to this device's neighbours' + * dev_list, one per address-family. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2440,6 +2444,9 @@ struct net_device { */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif + + struct hlist_head neighbours[NEIGH_NR_TABLES]; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 40aac1e24c68..9a832cab5b1d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include #include #include +#include /* * NUD stands for "neighbor unreachability detection" @@ -136,6 +137,7 @@ struct neigh_statistics { struct neighbour { struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -236,13 +238,6 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ -}; - static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 59f359c7b5e3..5e572f6eaf2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -60,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* Neighbour hash table buckets are protected with rwlock tbl->lock. @@ -211,6 +230,7 @@ bool neigh_remove_one(struct neighbour *n) write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -351,48 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; - struct neigh_hash_table *nht; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct hlist_node *tmp; - struct neighbour *n; + dev_head = neigh_get_dev_table(dev, tbl->family); - neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { - if (dev && n->dev != dev) - continue; - if (skip_perm && n->nud_state & NUD_PERMANENT) - continue; + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && n->nud_state & NUD_PERMANENT) + continue; - hlist_del_rcu(&n->hash); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); } } @@ -655,6 +669,10 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -935,6 +953,7 @@ static void neigh_periodic_work(struct work_struct *work) !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -3054,6 +3073,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, release = cb(n); if (release) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); -- cgit v1.2.3 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 7 +++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdev.h | 1 + net/core/dev.c | 2 ++ net/core/dev.h | 25 +++++++++++++++++++++++++ net/core/netdev-genl-gen.c | 5 +++-- net/core/netdev-genl.c | 12 ++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f9cb97d6106c..cbb544bd6c84 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -263,6 +263,11 @@ attribute-sets: the end of a NAPI cycle. This may add receive latency in exchange for reducing the number of frames processed by the network stack. type: uint + - + name: irq-suspend-timeout + doc: The timeout, in nanoseconds, of how long to suspend irq + processing, if event polling finds events + type: uint - name: queue attributes: @@ -653,6 +658,7 @@ operations: - pid - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout dump: request: attributes: @@ -704,6 +710,7 @@ operations: - id - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 6a31152e4606..4d910872963f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. It will be saved to the config * in napi_disable. @@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; n->config->napi_id = n->napi_id; napi_hash_del(n); } diff --git a/net/core/dev.h b/net/core/dev.h index 7881bced70a9..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, netdev->napi_config[i].gro_flush_timeout = timeout; } +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 21de7e10be16..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; /* Ops table for netdev */ @@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b49c3b4e5fbe..765ce7c9d73b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; @@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) @@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { + u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; @@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) napi_set_defer_hard_irqs(napi, defer); } + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 4b42fbc6bd8f73d9ded535d8c61ccaa837ff3bd4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:53 +0100 Subject: ndo_fdb_add: Add a parameter to report whether notification was sent Currently when FDB entries are added to or deleted from a VXLAN netdevice, the VXLAN driver emits one notification, including the VXLAN-specific attributes. The core however always sends a notification as well, a generic one. Thus two notifications are unnecessarily sent for these operations. A similar situation comes up with bridge driver, which also emits notifications on its own: # ip link add name vx type vxlan id 1000 dstport 4789 # bridge monitor fdb & [1] 1981693 # bridge fdb add de:ad:be:ef:13:37 dev vx self dst 192.0.2.1 de:ad:be:ef:13:37 dev vx dst 192.0.2.1 self permanent de:ad:be:ef:13:37 dev vx self permanent In order to prevent this duplicity, add a paremeter to ndo_fdb_add, bool *notified. The flag is primed to false, and if the callee sends a notification on its own, it sets it to true, thus informing the core that it should not generate another notification. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/cbf6ae8195e85cbf922f8058ce4eba770f3b71ed.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 ++- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/vxlan/vxlan_core.c | 5 ++++- include/linux/netdevice.h | 5 ++++- net/bridge/br_fdb.c | 12 +++++++----- net/bridge/br_private.h | 2 +- net/core/rtnetlink.c | 9 ++++++--- 12 files changed, 32 insertions(+), 18 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 55fb362eb508..ab5febf83ec3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13095,12 +13095,13 @@ static int i40e_get_phys_port_id(struct net_device *netdev, * @addr: the MAC address entry being added * @vid: VLAN ID * @flags: instructions from stack about fdb operation + * @notified: whether notification was emitted * @extack: netlink extended ack, unused currently */ static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct i40e_netdev_priv *np = netdev_priv(dev); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a6f586f9bfd1..c875036f654b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6125,12 +6125,14 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) * @addr: the MAC address entry being added * @vid: VLAN ID * @flags: instructions from stack about fdb operation + * @notified: whether notification was emitted * @extack: netlink extended ack */ static int ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, struct netlink_ext_ack __always_unused *extack) + u16 flags, bool *notified, + struct netlink_ext_ack __always_unused *extack) { int err; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index f1d088168723..f0528bd13184 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2486,7 +2486,7 @@ static int igb_set_features(struct net_device *netdev, static int igb_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { /* guarantee we can provide a unique filter for the unicast address */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 8b8404d8c946..adc9392463ce 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9954,7 +9954,7 @@ static int ixgbe_set_features(struct net_device *netdev, static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { /* guarantee we can provide a unique filter for the unicast address */ diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 7c9540a71725..4f15ba2c5525 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -730,7 +730,7 @@ static void ocelot_get_stats64(struct net_device *dev, static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid, u16 flags, + u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index b3588a1ebc25..2484cebd97d4 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -394,7 +394,7 @@ static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *netdev, const unsigned char *addr, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(netdev); int err = 0; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index edbd5afcec41..dfb462e63248 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1024,7 +1024,7 @@ static int macvlan_vlan_rx_kill_vid(struct net_device *dev, static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 42b07bc2b107..22f17c5c7549 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1241,7 +1241,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ @@ -1277,6 +1277,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); + if (!err) + *notified = true; + return err; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0aae346d919e..6a7fd191e1ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1248,8 +1248,10 @@ struct netdev_net_notifier { * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, - * struct netlink_ext_ack *extack); + * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid) @@ -1525,6 +1527,7 @@ struct net_device_ops { const unsigned char *addr, u16 vid, u16 flags, + bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 77f110035df1..5f29958f3ddd 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1152,7 +1152,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { int err = 0; @@ -1183,6 +1183,8 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, spin_unlock_bh(&br->hash_lock); } + if (!err) + *notified = true; return err; } @@ -1195,7 +1197,7 @@ static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; @@ -1258,10 +1260,10 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* VID was specified, so use it. */ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, - extack); + notified, extack); } else { err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, - extack); + notified, extack); if (err || !vg || !vg->num_vlans) goto out; @@ -1273,7 +1275,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, - nfea_tb, extack); + nfea_tb, notified, extack); if (err) goto out; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 041f6e571a20..ebfc59049ec1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -858,7 +858,7 @@ int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, - struct netlink_ext_ack *extack); + bool *notified, struct netlink_ext_ack *extack); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 327fa4957929..f31b2436cde5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4578,9 +4578,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags, extack); + nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else @@ -4589,16 +4590,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, - extack); + ¬ified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { + if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; -- cgit v1.2.3 From 42575ad5aab932273475d1ec3e7881cb5a05420e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:54 +0100 Subject: ndo_fdb_del: Add a parameter to report whether notification was sent In a similar fashion to ndo_fdb_add, which was covered in the previous patch, add the bool *notified argument to ndo_fdb_del. Callees that send a notification on their own set the flag to true. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/06b1acf4953ef0a5ed153ef1f32d7292044f2be6.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/vxlan/vxlan_core.c | 5 ++++- include/linux/netdevice.h | 9 +++++++-- net/bridge/br_fdb.c | 15 ++++++++------- net/bridge/br_private.h | 2 +- net/core/rtnetlink.c | 11 ++++++++--- 9 files changed, 34 insertions(+), 18 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index c875036f654b..b79848fe2a9e 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6166,12 +6166,14 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], * @dev: the net device pointer * @addr: the MAC address entry being added * @vid: VLAN ID + * @notified: whether notification was emitted * @extack: netlink extended ack */ static int ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - __always_unused u16 vid, struct netlink_ext_ack *extack) + __always_unused u16 vid, bool *notified, + struct netlink_ext_ack *extack) { int err; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 4f15ba2c5525..558e03301aa8 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -744,7 +744,7 @@ static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int ocelot_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 2484cebd97d4..eb69121df726 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -367,7 +367,7 @@ static int qlcnic_set_mac(struct net_device *netdev, void *p) static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *netdev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index dfb462e63248..fed4fe2a4748 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1049,7 +1049,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 22f17c5c7549..9ea63059d52d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1319,7 +1319,7 @@ out: /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1341,6 +1341,9 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); + if (!err) + *notified = true; + return err; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a7fd191e1ee..ecc686409161 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1254,8 +1254,11 @@ struct netdev_net_notifier { * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, - * const unsigned char *addr, u16 vid) + * const unsigned char *addr, u16 vid + * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, @@ -1533,7 +1536,9 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid, struct netlink_ext_ack *extack); + u16 vid, + bool *notified, + struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 5f29958f3ddd..82bac2426631 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1287,7 +1287,7 @@ out: static int fdb_delete_by_addr_and_port(struct net_bridge *br, const struct net_bridge_port *p, - const u8 *addr, u16 vlan) + const u8 *addr, u16 vlan, bool *notified) { struct net_bridge_fdb_entry *fdb; @@ -1296,18 +1296,19 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br, return -ENOENT; fdb_delete(br, fdb, true); + *notified = true; return 0; } static int __br_fdb_delete(struct net_bridge *br, const struct net_bridge_port *p, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, bool *notified) { int err; spin_lock_bh(&br->hash_lock); - err = fdb_delete_by_addr_and_port(br, p, addr, vid); + err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified); spin_unlock_bh(&br->hash_lock); return err; @@ -1316,7 +1317,7 @@ static int __br_fdb_delete(struct net_bridge *br, /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; @@ -1339,19 +1340,19 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], } if (vid) { - err = __br_fdb_delete(br, p, addr, vid); + err = __br_fdb_delete(br, p, addr, vid, notified); } else { struct net_bridge_vlan *v; err = -ENOENT; - err &= __br_fdb_delete(br, p, addr, 0); + err &= __br_fdb_delete(br, p, addr, 0, notified); if (!vg || !vg->num_vlans) return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err &= __br_fdb_delete(br, p, addr, v->vid); + err &= __br_fdb_delete(br, p, addr, v->vid, notified); } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ebfc59049ec1..9853cfbb9d14 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -853,7 +853,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - struct netlink_ext_ack *extack); + bool *notified, struct netlink_ext_ack *extack); int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f31b2436cde5..dd142f444659 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4701,11 +4701,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); + bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); @@ -4719,10 +4721,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { + bool notified = false; + ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { @@ -4733,7 +4738,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!err) { - if (!del_bulk) + if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; -- cgit v1.2.3