From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as a IF_LINK_MODE_ and __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..0750b54b3765 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -288,6 +288,7 @@ enum netdev_state_t { __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, + __LINK_STATE_TESTING, }; @@ -3907,6 +3908,46 @@ static inline bool netif_dormant(const struct net_device *dev) } +/** + * netif_testing_on - mark device as under test. + * @dev: network device + * + * Mark device as under test (as per RFC2863). + * + * The testing state indicates that some test(s) must be performed on + * the interface. After completion, of the test, the interface state + * will change to up, dormant, or down, as appropriate. + */ +static inline void netif_testing_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing_off - set device as not under test. + * @dev: network device + * + * Device is not in testing state. + */ +static inline void netif_testing_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_TESTING, &dev->state)) + linkwatch_fire_event(dev); +} + +/** + * netif_testing - test if device is under test + * @dev: network device + * + * Check if device is under test + */ +static inline bool netif_testing(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_TESTING, &dev->state); +} + + /** * netif_oper_up - test if device is operational * @dev: network device -- cgit v1.2.3 From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows to arm the napi hrtimer, instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy than gro_flush_timeout (/sys/class/net/ethX/) By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone : NIC hard irqs should be rearmed as before. One concrete usage can be : echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQ, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not call napi_complete_done(). This feature also can be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3 From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add new ndo to get the xmit slave of master device. The reference counters are not incremented so the caller must be careful with locks. User can ask to get the xmit slave assume all the slaves can transmit by set all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of master device. If all_slaves is true, function + * assume all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From 1a33e10e4a95cb109ff1145098175df3113313ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 2 May 2020 22:22:19 -0700 Subject: net: partially revert dynamic lockdep key changes This patch reverts the folowing commits: commit 064ff66e2bef84f1153087612032b5b9eab005bd "bonding: add missing netdev_update_lockdep_key()" commit 53d374979ef147ab51f5d632dfe20b14aebeccd0 "net: avoid updating qdisc_xmit_lock_key in netdev_update_lockdep_key()" commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 "net: fix kernel-doc warning in " commit ab92d68fc22f9afab480153bd82a20f6e2533769 "net: core: add generic lockdep keys" but keeps the addr_list_lock_key because we still lock addr_list_lock nestedly on stack devices, unlikely xmit_lock this is safe because we don't take addr_list_lock on any fast path. Reported-and-tested-by: syzbot+aaa6fa4949cc5d9b7b25@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Taehee Yoo Signed-off-by: Cong Wang Acked-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 16 ++++ drivers/net/hamradio/bpqether.c | 20 +++++ drivers/net/hyperv/netvsc_drv.c | 2 + drivers/net/ipvlan/ipvlan_main.c | 2 + drivers/net/macsec.c | 2 + drivers/net/macvlan.c | 2 + drivers/net/ppp/ppp_generic.c | 2 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/intersil/hostap/hostap_hw.c | 22 ++++++ include/linux/netdevice.h | 27 +++++-- net/8021q/vlan_dev.c | 21 ++++++ net/batman-adv/soft-interface.c | 30 ++++++++ net/bluetooth/6lowpan.c | 8 ++ net/core/dev.c | 90 ++++++++++++++++++----- net/dsa/slave.c | 12 +++ net/ieee802154/6lowpan/core.c | 8 ++ net/l2tp/l2tp_eth.c | 1 + net/netrom/af_netrom.c | 21 ++++++ net/rose/af_rose.c | 21 ++++++ net/sched/sch_generic.c | 17 +++-- 22 files changed, 294 insertions(+), 33 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..d01871321d22 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4898,6 +4898,7 @@ static int bond_init(struct net_device *bond_dev) spin_lock_init(&bond->stats_lock); lockdep_register_key(&bond->stats_lock_key); lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key); + netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 79d72c88bbef..b3cabc274121 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -299,6 +299,20 @@ static void nfp_repr_clean(struct nfp_repr *repr) nfp_port_free(repr->port); } +static struct lock_class_key nfp_repr_netdev_xmit_lock_key; + +static void nfp_repr_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nfp_repr_netdev_xmit_lock_key); +} + +static void nfp_repr_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nfp_repr_set_lockdep_class_one, NULL); +} + int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev) @@ -308,6 +322,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 repr_cap = nn->tlv_caps.repr_cap; int err; + nfp_repr_set_lockdep_class(netdev); + repr->port = port; repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL); if (!repr->dst) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..206688154fdf 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -107,6 +107,25 @@ struct bpqdev { static LIST_HEAD(bpq_devices); +/* + * bpqether network devices are paired with ethernet devices below them, so + * form a special "super class" of normal ethernet devices; split their locks + * off into a separate class since they always nest. + */ +static struct lock_class_key bpq_netdev_xmit_lock_key; + +static void bpq_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &bpq_netdev_xmit_lock_key); +} + +static void bpq_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); +} + /* ------------------------------------------------------------------------ */ @@ -477,6 +496,7 @@ static int bpq_new_device(struct net_device *edev) err = register_netdevice(ndev); if (err) goto error; + bpq_set_lockdep_class(ndev); /* List protected by RTNL */ list_add_rcu(&bpq->bpq_list, &bpq_devices); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..c0b647a4c893 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2456,6 +2456,8 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; + netdev_lockdep_set_classes(net); + /* MTU range: 68 - 1500 or 65521 */ net->min_mtu = NETVSC_MTU_MIN; if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f195f278a83a..15e87c097b0b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -131,6 +131,8 @@ static int ipvlan_init(struct net_device *dev) dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; + netdev_lockdep_set_classes(dev); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..ea3f25cc79ef 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4047,6 +4047,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; + netdev_lockdep_set_classes(dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d45600e0a38c..34eb073cdd74 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -890,6 +890,8 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; + netdev_lockdep_set_classes(dev); + vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22cc2cb9d878..7d005896a0f9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1410,6 +1410,8 @@ static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; + netdev_lockdep_set_classes(dev); + ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. This ensures * that ppp_destroy_interface() won't run before the device gets diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f9..8c1e02752ff6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,6 +1647,7 @@ static int team_init(struct net_device *dev) lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); + netdev_lockdep_set_classes(dev); return 0; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 56f8aab46f89..43928a1c2f2a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -867,6 +867,7 @@ static int vrf_dev_init(struct net_device *dev) /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; + netdev_lockdep_set_classes(dev); return 0; out_rth: diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 58212c532c90..aadf3dec5bf3 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3041,6 +3041,27 @@ static void prism2_clear_set_tim_queue(local_info_t *local) } } + +/* + * HostAP uses two layers of net devices, where the inner + * layer gets called all the time from the outer layer. + * This is a natural nesting, which needs a split lock type. + */ +static struct lock_class_key hostap_netdev_xmit_lock_key; + +static void prism2_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &hostap_netdev_xmit_lock_key); +} + +static void prism2_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); +} + static struct net_device * prism2_init_local_data(struct prism2_helper_functions *funcs, int card_idx, struct device *sdev) @@ -3199,6 +3220,7 @@ while (0) if (ret >= 0) ret = register_netdevice(dev); + prism2_set_lockdep_class(dev); rtnl_unlock(); if (ret < 0) { printk(KERN_WARNING "%s: register netdevice failed!\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a8d40f1ffe2..7725efd6e48a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1805,13 +1805,11 @@ enum netdev_priv_flags { * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. - * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - * spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * @qdisc_xmit_lock_key: lockdep class annotating - * netdev_queue->_xmit_lock spinlock + * * @addr_list_lock_key: lockdep class annotating * net_device->addr_list_lock spinlock + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2112,10 +2110,9 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key qdisc_tx_busylock_key; - struct lock_class_key qdisc_running_key; - struct lock_class_key qdisc_xmit_lock_key; struct lock_class_key addr_list_lock_key; + struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; @@ -2200,6 +2197,20 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + (dev)->qdisc_running_key = &qdisc_running_key; \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..319220b2341d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,6 +489,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +598,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..822af540b854 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -739,6 +739,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +780,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/core/dev.c b/net/core/dev.c index afff16849c26..f8d83922a6af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -9208,7 +9276,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9323,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9889,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +9978,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba8bf90dc0cc..fa2634043751 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1671,6 +1671,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1754,6 +1763,9 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,6 +63,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + /* * Socket removal during an interrupt is now safe. */ @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -64,6 +64,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + /* * Convert a ROSE address into text. */ @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ad24fa1a51e6..ebc55d884247 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); -- cgit v1.2.3 From 607259a695312cdfac2b52fb9d5b5890c834d573 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 May 2020 15:03:13 +0200 Subject: net: add a new ndo_tunnel_ioctl method This method is used to properly allow kernel callers of the IPv4 route management ioctls. The exsting ip_tunnel_ioctl helper is renamed to ip_tunnel_ctl to better reflect that it doesn't directly implement ioctls touching user memory, and is used for the guts of ndo_tunnel_ctl implementations. A new ip_tunnel_ioctl helper is added that can be wired up directly to the ndo_do_ioctl method and takes care of the copy to and from userspace. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip_tunnels.h | 3 ++- net/ipv4/ip_gre.c | 35 ++++++++++++++--------------------- net/ipv4/ip_tunnel.c | 16 +++++++++++++++- net/ipv4/ip_vti.c | 32 +++++++++++++------------------- net/ipv4/ipip.c | 30 +++++++++--------------------- 6 files changed, 59 insertions(+), 63 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a8f8daef09d..a18f8fdf4260 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -53,6 +53,7 @@ struct netpoll_info; struct device; struct phy_device; struct dsa_port; +struct ip_tunnel_parm; struct macsec_context; struct macsec_ops; @@ -1274,6 +1275,9 @@ struct netdev_net_notifier { * Get devlink port instance associated with a given netdev. * Called with a reference on the netdevice and devlink locks only, * rtnl_lock is not held. + * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, + * int cmd); + * Add, change, delete or get information on an IPv4 tunnel. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1479,6 +1483,8 @@ struct net_device_ops { int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); + int (*ndo_tunnel_ctl)(struct net_device *dev, + struct ip_tunnel_parm *p, int cmd); }; /** diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 236503a50759..076e5d7db7d3 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -269,7 +269,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0ce9b91ff55c..4e31f23e4117 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -768,45 +768,37 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, + int cmd) { - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || - ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); + p->i_flags = gre_flags_to_tnl_flags(p->i_flags); + p->o_flags = gre_flags_to_tnl_flags(p->o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - + p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); return 0; } @@ -924,10 +916,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cd4b84310d92..f4f1d11eab50 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -860,7 +860,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -960,6 +960,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) done: return err; } +EXPORT_SYMBOL_GPL(ip_tunnel_ctl); + +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); + if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return err; +} EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1b4e6f298648..c8974360a99f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -378,38 +378,31 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || + p->iph.ihl != 5) return -EINVAL; } - if (!(p.i_flags & GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags & GRE_KEY)) - p.o_key = 0; + if (!(p->i_flags & GRE_KEY)) + p->i_key = 0; + if (!(p->o_flags & GRE_KEY)) + p->o_key = 0; - p.i_flags = VTI_ISVTI; + p->i_flags = VTI_ISVTI; - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY; - p.o_flags |= GRE_KEY; + p->i_flags |= GRE_KEY; + p->o_flags |= GRE_KEY; } - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -417,10 +410,11 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2f01cf6fa0de..df663baf2516 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,41 +327,29 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { - int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || - !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + if (p->iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } - p.i_key = p.o_key = 0; - p.i_flags = p.o_flags = 0; - err = ip_tunnel_ioctl(dev, &p, cmd); - if (err) - return err; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - - return 0; + p->i_key = p->o_key = 0; + p->i_flags = p->o_flags = 0; + return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ -- cgit v1.2.3 From cd16627fc0468564fdd60f20ad52420b87195127 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Sat, 23 May 2020 15:27:10 +0200 Subject: net: devres: provide devm_register_netdev() Provide devm_register_netdev() - a device resource managed variant of register_netdev(). This new helper will only work for net_device structs that are also already managed by devres. Signed-off-by: Bartosz Golaszewski Signed-off-by: David S. Miller --- Documentation/driver-api/driver-model/devres.rst | 1 + include/linux/netdevice.h | 2 + net/devres.c | 55 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 50df28d20fa7..fc242ed4bde5 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -375,6 +375,7 @@ MUX NET devm_alloc_etherdev() devm_alloc_etherdev_mqs() + devm_register_netdev() PER-CPU MEM devm_alloc_percpu() diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a18f8fdf4260..1a96e9c4ec36 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4280,6 +4280,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, int register_netdev(struct net_device *dev); void unregister_netdev(struct net_device *dev); +int devm_register_netdev(struct device *dev, struct net_device *ndev); + /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); diff --git a/net/devres.c b/net/devres.c index b97b0c5a8216..57a6a88d11f6 100644 --- a/net/devres.c +++ b/net/devres.c @@ -38,3 +38,58 @@ struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, return dr->ndev; } EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + +static void devm_netdev_release(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + unregister_netdev(res->ndev); +} + +static int netdev_devres_match(struct device *dev, void *this, void *match_data) +{ + struct net_device_devres *res = this; + struct net_device *ndev = match_data; + + return ndev == res->ndev; +} + +/** + * devm_register_netdev - resource managed variant of register_netdev() + * @dev: managing device for this netdev - usually the parent device + * @ndev: device to register + * + * This is a devres variant of register_netdev() for which the unregister + * function will be call automatically when the managing device is + * detached. Note: the net_device used must also be resource managed by + * the same struct device. + */ +int devm_register_netdev(struct device *dev, struct net_device *ndev) +{ + struct net_device_devres *dr; + int ret; + + /* struct net_device must itself be managed. For now a managed netdev + * can only be allocated by devm_alloc_etherdev_mqs() so the check is + * straightforward. + */ + if (WARN_ON(!devres_find(dev, devm_free_netdev, + netdev_devres_match, ndev))) + return -EINVAL; + + dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = register_netdev(ndev); + if (ret) { + devres_free(dr); + return ret; + } + + dr->ndev = ndev; + devres_add(ndev->dev.parent, dr); + + return 0; +} +EXPORT_SYMBOL(devm_register_netdev); -- cgit v1.2.3 From ceabef7dd71720aef58bd182943352c9c307a3de Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Sun, 7 Jun 2020 21:40:14 -0700 Subject: dynamic_debug: add an option to enable dynamic debug for modules only Instead of enabling dynamic debug globally with CONFIG_DYNAMIC_DEBUG, CONFIG_DYNAMIC_DEBUG_CORE will only enable core function of dynamic debug. With the DYNAMIC_DEBUG_MODULE defined for any modules, dynamic debug will be tied to them. This is useful for people who only want to enable dynamic debug for kernel modules without worrying about kernel image size and memory consumption is increasing too much. [orson.zhai@unisoc.com: v2] Link: http://lkml.kernel.org/r/1587408228-10861-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Signed-off-by: Andrew Morton Acked-by: Greg Kroah-Hartman Acked-by: Petr Mladek Cc: Jonathan Corbet Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Baron Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1586521984-5890-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++++ include/linux/dev_printk.h | 6 ++++-- include/linux/dynamic_debug.h | 2 +- include/linux/net.h | 3 ++- include/linux/netdevice.h | 6 ++++-- include/linux/printk.h | 9 ++++++--- include/rdma/ib_verbs.h | 6 ++++-- lib/Kconfig.debug | 12 ++++++++++++ lib/Makefile | 2 +- lib/dynamic_debug.c | 9 +++++++-- 10 files changed, 46 insertions(+), 14 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a5..016a9c5faa34 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c..ef2f3986c493 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bd4eb7f5ec1..333e878d8af9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32f19b4d1d2a..315516fa4ef4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -190,7 +190,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; -- cgit v1.2.3 From 845e0ebb4408d4473cf60d21224a897037e9a77a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 8 Jun 2020 14:53:01 -0700 Subject: net: change addr_list_lock back to static key The dynamic key update for addr_list_lock still causes troubles, for example the following race condition still exists: CPU 0: CPU 1: (RCU read lock) (RTNL lock) dev_mc_seq_show() netdev_update_lockdep_key() -> lockdep_unregister_key() -> netif_addr_lock_bh() because lockdep doesn't provide an API to update it atomically. Therefore, we have to move it back to static keys and use subclass for nest locking like before. In commit 1a33e10e4a95 ("net: partially revert dynamic lockdep key changes"), I already reverted most parts of commit ab92d68fc22f ("net: core: add generic lockdep keys"). This patch reverts the rest and also part of commit f3b0a18bb6cb ("net: remove unnecessary variables and callback"). After this patch, addr_list_lock changes back to using static keys and subclasses to satisfy lockdep. Thanks to dev->lower_level, we do not have to change back to ->ndo_get_lock_subclass(). And hopefully this reduces some syzbot lockdep noises too. Reported-by: syzbot+f3a0e80c34b3fc28ac5e@syzkaller.appspotmail.com Cc: Taehee Yoo Cc: Dmitry Vyukov Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 -- drivers/net/bonding/bond_options.c | 2 -- drivers/net/hamradio/bpqether.c | 2 ++ drivers/net/macsec.c | 5 ++++ drivers/net/macvlan.c | 13 ++++++++-- drivers/net/vxlan.c | 4 +--- drivers/net/wireless/intersil/hostap/hostap_hw.c | 3 +++ include/linux/netdevice.h | 12 ++++++---- net/8021q/vlan_dev.c | 8 +++++-- net/batman-adv/soft-interface.c | 2 ++ net/bridge/br_device.c | 8 +++++++ net/core/dev.c | 30 +++++++++++++----------- net/core/dev_addr_lists.c | 12 +++++----- net/core/rtnetlink.c | 1 - net/dsa/master.c | 4 ++++ net/netrom/af_netrom.c | 2 ++ net/rose/af_rose.c | 2 ++ 17 files changed, 76 insertions(+), 36 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a25c65d4af71..004919aea5fb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3687,8 +3687,6 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd case BOND_RELEASE_OLD: case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); - if (!res) - netdev_update_lockdep_key(slave_dev); break; case BOND_SETHWADDR_OLD: case SIOCBONDSETHWADDR: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 215c10923289..ddb3916d3506 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1398,8 +1398,6 @@ static int bond_option_slaves_set(struct bonding *bond, case '-': slave_dbg(bond->dev, dev, "Releasing interface\n"); ret = bond_release(bond->dev, dev); - if (!ret) - netdev_update_lockdep_key(dev); break; default: diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 60dcaf2a04a9..1ad6085994b1 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -113,6 +113,7 @@ static LIST_HEAD(bpq_devices); * off into a separate class since they always nest. */ static struct lock_class_key bpq_netdev_xmit_lock_key; +static struct lock_class_key bpq_netdev_addr_lock_key; static void bpq_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -123,6 +124,7 @@ static void bpq_set_lockdep_class_one(struct net_device *dev, static void bpq_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &bpq_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 20b53e255f68..e56547bfdac9 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3999,6 +3999,8 @@ static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) return 0; } +static struct lock_class_key macsec_netdev_addr_lock_key; + static int macsec_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -4050,6 +4052,9 @@ static int macsec_newlink(struct net *net, struct net_device *dev, return err; netdev_lockdep_set_classes(dev); + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &macsec_netdev_addr_lock_key, + dev->lower_level); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 563aed5b3d9f..6a6cc9f75307 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -860,6 +860,8 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ +static struct lock_class_key macvlan_netdev_addr_lock_key; + #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) @@ -875,6 +877,14 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) +static void macvlan_set_lockdep_class(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &macvlan_netdev_addr_lock_key, + dev->lower_level); +} + static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); @@ -892,8 +902,7 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_size = lowerdev->gso_max_size; dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; - - netdev_lockdep_set_classes(dev); + macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 5bb448ae6c9c..47424b2da643 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -4245,10 +4245,8 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); - if (lowerdev && lowerdev != dst->remote_dev) { + if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; - netdev_update_lockdep_key(lowerdev); - } vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); return 0; } diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index aadf3dec5bf3..2ab34cf74ecc 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3048,6 +3048,7 @@ static void prism2_clear_set_tim_queue(local_info_t *local) * This is a natural nesting, which needs a split lock type. */ static struct lock_class_key hostap_netdev_xmit_lock_key; +static struct lock_class_key hostap_netdev_addr_lock_key; static void prism2_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -3059,6 +3060,8 @@ static void prism2_set_lockdep_class_one(struct net_device *dev, static void prism2_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, + &hostap_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..e2825e27ef89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1821,8 +1821,6 @@ enum netdev_priv_flags { * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * - * @addr_list_lock_key: lockdep class annotating - * net_device->addr_list_lock spinlock * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * @@ -2125,7 +2123,6 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key addr_list_lock_key; struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; @@ -2217,10 +2214,13 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ (dev)->qdisc_running_key = &qdisc_running_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ @@ -3253,7 +3253,6 @@ static inline void netif_stop_queue(struct net_device *dev) } void netif_tx_stop_all_queues(struct net_device *dev); -void netdev_update_lockdep_key(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { @@ -4239,6 +4238,11 @@ static inline void netif_addr_lock(struct net_device *dev) spin_lock(&dev->addr_list_lock); } +static inline void netif_addr_lock_nested(struct net_device *dev) +{ + spin_lock_nested(&dev->addr_list_lock, dev->lower_level); +} + static inline void netif_addr_lock_bh(struct net_device *dev) { spin_lock_bh(&dev->addr_list_lock); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f00bb57f0f60..c8d6a07e23c5 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -494,6 +494,7 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) * separate class since they always nest. */ static struct lock_class_key vlan_netdev_xmit_lock_key; +static struct lock_class_key vlan_netdev_addr_lock_key; static void vlan_dev_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -502,8 +503,11 @@ static void vlan_dev_set_lockdep_one(struct net_device *dev, lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } -static void vlan_dev_set_lockdep_class(struct net_device *dev) +static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) { + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &vlan_netdev_addr_lock_key, + subclass); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } @@ -597,7 +601,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev); + vlan_dev_set_lockdep_class(dev, dev->lower_level); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0ddd80130ea3..f1f1c86f3419 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -745,6 +745,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; +static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue @@ -765,6 +766,7 @@ static void batadv_set_lockdep_class_one(struct net_device *dev, */ static void batadv_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8ec1362588af..8c7b78f8bc23 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -105,6 +105,13 @@ out: return NETDEV_TX_OK; } +static struct lock_class_key bridge_netdev_addr_lock_key; + +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -143,6 +150,7 @@ static int br_dev_init(struct net_device *dev) br_fdb_hash_fini(br); } + br_set_lockdep_class(dev); return err; } diff --git a/net/core/dev.c b/net/core/dev.c index 061496a1f640..6bc2388141f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -439,6 +439,7 @@ static const char *const netdev_lock_name[] = { "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { @@ -460,11 +461,25 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} #endif /******************************************************************************* @@ -9373,15 +9388,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -void netdev_update_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->addr_list_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); - - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); -} -EXPORT_SYMBOL(netdev_update_lockdep_key); - /** * register_netdevice - register a network device * @dev: device to register @@ -9420,7 +9426,7 @@ int register_netdevice(struct net_device *dev) return ret; spin_lock_init(&dev->addr_list_lock); - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); + netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) @@ -9939,8 +9945,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - lockdep_register_key(&dev->addr_list_lock_key); - dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; @@ -10028,8 +10032,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - lockdep_unregister_key(&dev->addr_list_lock_key); - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 2f949b5a1eb9..6393ba930097 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2269199c5891..9aedc15736ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2462,7 +2462,6 @@ static int do_set_master(struct net_device *dev, int ifindex, err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; - netdev_update_lockdep_key(dev); } else { return -EOPNOTSUPP; } diff --git a/net/dsa/master.c b/net/dsa/master.c index a621367c6e8c..480a61460c23 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -327,6 +327,8 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } +static struct lock_class_key dsa_master_addr_list_lock_key; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -345,6 +347,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; + lockdep_set_class(&dev->addr_list_lock, + &dsa_master_addr_list_lock_key); ret = dsa_master_ethtool_setup(dev); if (ret) return ret; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index eccc7d366e17..f90ef6934b8f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -70,6 +70,7 @@ static const struct proto_ops nr_proto_ops; * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; +static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -80,6 +81,7 @@ static void nr_set_lockdep_one(struct net_device *dev, static void nr_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index e7a872207b46..ce85656ac9c1 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -71,6 +71,7 @@ ax25_address rose_callsign; * separate class since they always nest. */ static struct lock_class_key rose_netdev_xmit_lock_key; +static struct lock_class_key rose_netdev_addr_lock_key; static void rose_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -81,6 +82,7 @@ static void rose_set_lockdep_one(struct net_device *dev, static void rose_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); } -- cgit v1.2.3 From fb7861d14c8d7edac65b2fcb6e8031cb138457b2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Jun 2020 15:52:05 +0000 Subject: net: core: reduce recursion limit value In the current code, ->ndo_start_xmit() can be executed recursively only 10 times because of stack memory. But, in the case of the vxlan, 10 recursion limit value results in a stack overflow. In the current code, the nested interface is limited by 8 depth. There is no critical reason that the recursion limitation value should be 10. So, it would be good to be the same value with the limitation value of nesting interface depth. Test commands: ip link add vxlan10 type vxlan vni 10 dstport 4789 srcport 4789 4789 ip link set vxlan10 up ip a a 192.168.10.1/24 dev vxlan10 ip n a 192.168.10.2 dev vxlan10 lladdr fc:22:33:44:55:66 nud permanent for i in {9..0} do let A=$i+1 ip link add vxlan$i type vxlan vni $i dstport 4789 srcport 4789 4789 ip link set vxlan$i up ip a a 192.168.$i.1/24 dev vxlan$i ip n a 192.168.$i.2 dev vxlan$i lladdr fc:22:33:44:55:66 nud permanent bridge fdb add fc:22:33:44:55:66 dev vxlan$A dst 192.168.$i.2 self done hping3 192.168.10.2 -2 -d 60000 Splat looks like: [ 103.814237][ T1127] ============================================================================= [ 103.871955][ T1127] BUG kmalloc-2k (Tainted: G B ): Padding overwritten. 0x00000000897a2e4f-0x000 [ 103.873187][ T1127] ----------------------------------------------------------------------------- [ 103.873187][ T1127] [ 103.874252][ T1127] INFO: Slab 0x000000005cccc724 objects=5 used=5 fp=0x0000000000000000 flags=0x10000000001020 [ 103.881323][ T1127] CPU: 3 PID: 1127 Comm: hping3 Tainted: G B 5.7.0+ #575 [ 103.882131][ T1127] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 103.883006][ T1127] Call Trace: [ 103.883324][ T1127] dump_stack+0x96/0xdb [ 103.883716][ T1127] slab_err+0xad/0xd0 [ 103.884106][ T1127] ? _raw_spin_unlock+0x1f/0x30 [ 103.884620][ T1127] ? get_partial_node.isra.78+0x140/0x360 [ 103.885214][ T1127] slab_pad_check.part.53+0xf7/0x160 [ 103.885769][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.886316][ T1127] check_slab+0x97/0xb0 [ 103.886763][ T1127] alloc_debug_processing+0x84/0x1a0 [ 103.887308][ T1127] ___slab_alloc+0x5a5/0x630 [ 103.887765][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.888265][ T1127] ? lock_downgrade+0x730/0x730 [ 103.888762][ T1127] ? pskb_expand_head+0x110/0xe10 [ 103.889244][ T1127] ? __slab_alloc+0x3e/0x80 [ 103.889675][ T1127] __slab_alloc+0x3e/0x80 [ 103.890108][ T1127] __kmalloc_node_track_caller+0xc7/0x420 [ ... ] Fixes: 11a766ce915f ("net: Increase xmit RECURSION_LIMIT to 10.") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6fc613ed8eae..39e28e11863c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3157,7 +3157,7 @@ static inline int dev_recursion_level(void) return this_cpu_read(softnet_data.xmit.recursion); } -#define XMIT_RECURSION_LIMIT 10 +#define XMIT_RECURSION_LIMIT 8 static inline bool dev_xmit_recursion(void) { return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > -- cgit v1.2.3 From 5c45a918263e4da142b9cf40a1dfcb4134d454a2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 23 Jun 2020 09:08:59 +0200 Subject: net: netdevice.h: add a description for napi_defer_hard_irqs Changeset 6f8b12d661d0 ("net: napi: add hard irqs deferral feature") added a new element at struct net_device. Add a description for it, based on what's described at the changeset which added such feature. Fixes: 6f8b12d661d0 ("net: napi: add hard irqs deferral feature") Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/807a3840e7bc1562adefadb0535c9f47e6ab52e0.1592895969.git.mchehab+huawei@kernel.org Signed-off-by: Jonathan Corbet --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6fc613ed8eae..f3ca52958a17 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1742,6 +1742,8 @@ enum netdev_priv_flags { * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer * @gro_flush_timeout: timeout for GRO layer in NAPI + * @napi_defer_hard_irqs: If not zero, provides a counter that would + * allow to avoid NIC hard IRQ, on busy queues. * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one -- cgit v1.2.3 From cc4e3835eff474aa274d6e1d18f69d9d296d3b76 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jul 2020 17:42:46 -0700 Subject: udp_tunnel: add central NIC RX port offload infrastructure Cater to devices which: (a) may want to sleep in the callbacks; (b) only have IPv4 support; (c) need all the programming to happen while the netdev is up. Drivers attach UDP tunnel offload info struct to their netdevs, where they declare how many UDP ports of various tunnel types they support. Core takes care of tracking which ports to offload. Use a fixed-size array since this matches what almost all drivers do, and avoids a complexity and uncertainty around memory allocations in an atomic context. Make sure that tunnel drivers don't try to replay the ports when new NIC netdev is registered. Automatic replays would mess up reference counting, and will be removed completely once all drivers are converted. v4: - use a #define NULL to avoid build issues with CONFIG_INET=n. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/geneve.c | 6 +- drivers/net/vxlan.c | 6 +- include/linux/netdevice.h | 8 + include/net/udp_tunnel.h | 137 ++++++++ net/ipv4/Makefile | 3 +- net/ipv4/udp_tunnel.c | 224 ------------- net/ipv4/udp_tunnel_core.c | 224 +++++++++++++ net/ipv4/udp_tunnel_nic.c | 821 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_stub.c | 7 + 9 files changed, 1207 insertions(+), 229 deletions(-) delete mode 100644 net/ipv4/udp_tunnel.c create mode 100644 net/ipv4/udp_tunnel_core.c create mode 100644 net/ipv4/udp_tunnel_nic.c create mode 100644 net/ipv4/udp_tunnel_stub.c (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index e3d074008da2..49b00def2eef 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1796,9 +1796,11 @@ static int geneve_netdevice_event(struct notifier_block *unused, event == NETDEV_UDP_TUNNEL_DROP_INFO) { geneve_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); } else if (event == NETDEV_UNREGISTER) { - geneve_offload_rx_ports(dev, false); + if (!dev->udp_tunnel_nic_info) + geneve_offload_rx_ports(dev, false); } else if (event == NETDEV_REGISTER) { - geneve_offload_rx_ports(dev, true); + if (!dev->udp_tunnel_nic_info) + geneve_offload_rx_ports(dev, true); } return NOTIFY_DONE; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 89d85dcb200e..a43c97b13924 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -4477,10 +4477,12 @@ static int vxlan_netdevice_event(struct notifier_block *unused, struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) { - vxlan_offload_rx_ports(dev, false); + if (!dev->udp_tunnel_nic_info) + vxlan_offload_rx_ports(dev, false); vxlan_handle_lowerdev_unregister(vn, dev); } else if (event == NETDEV_REGISTER) { - vxlan_offload_rx_ports(dev, true); + if (!dev->udp_tunnel_nic_info) + vxlan_offload_rx_ports(dev, true); } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || event == NETDEV_UDP_TUNNEL_DROP_INFO) { vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39e28e11863c..ac2cd3f49aba 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -65,6 +65,8 @@ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; +struct udp_tunnel_nic_info; +struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; @@ -1836,6 +1838,10 @@ enum netdev_priv_flags { * * @macsec_ops: MACsec offloading ops * + * @udp_tunnel_nic_info: static structure describing the UDP tunnel + * offload capabilities of the device + * @udp_tunnel_nic: UDP tunnel offload state + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2134,6 +2140,8 @@ struct net_device { /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif + const struct udp_tunnel_nic_info *udp_tunnel_nic_info; + struct udp_tunnel_nic *udp_tunnel_nic; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0615e25f041c..ee34619e4cfa 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -115,6 +115,7 @@ struct udp_tunnel_info { unsigned short type; sa_family_t sa_family; __be16 port; + u8 hw_priv; }; /* Notify network devices of offloadable types */ @@ -181,4 +182,140 @@ static inline void udp_tunnel_encap_enable(struct socket *sock) udp_encap_enable(); } +#define UDP_TUNNEL_NIC_MAX_TABLES 4 + +enum udp_tunnel_nic_info_flags { + /* Device callbacks may sleep */ + UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), + /* Device only supports offloads when it's open, all ports + * will be removed before close and re-added after open. + */ + UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), + /* Device supports only IPv4 tunnels */ + UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), +}; + +/** + * struct udp_tunnel_nic_info - driver UDP tunnel offload information + * @set_port: callback for adding a new port + * @unset_port: callback for removing a port + * @sync_table: callback for syncing the entire port table at once + * @flags: device flags from enum udp_tunnel_nic_info_flags + * @tables: UDP port tables this device has + * @tables.n_entries: number of entries in this table + * @tables.tunnel_types: types of tunnels this table accepts + * + * Drivers are expected to provide either @set_port and @unset_port callbacks + * or the @sync_table callback. Callbacks are invoked with rtnl lock held. + * + * Known limitations: + * - UDP tunnel port notifications are fundamentally best-effort - + * it is likely the driver will both see skbs which use a UDP tunnel port, + * while not being a tunneled skb, and tunnel skbs from other ports - + * drivers should only use these ports for non-critical RX-side offloads, + * e.g. the checksum offload; + * - none of the devices care about the socket family at present, so we don't + * track it. Please extend this code if you care. + */ +struct udp_tunnel_nic_info { + /* one-by-one */ + int (*set_port)(struct net_device *dev, + unsigned int table, unsigned int entry, + struct udp_tunnel_info *ti); + int (*unset_port)(struct net_device *dev, + unsigned int table, unsigned int entry, + struct udp_tunnel_info *ti); + + /* all at once */ + int (*sync_table)(struct net_device *dev, unsigned int table); + + unsigned int flags; + + struct udp_tunnel_nic_table_info { + unsigned int n_entries; + unsigned int tunnel_types; + } tables[UDP_TUNNEL_NIC_MAX_TABLES]; +}; + +/* UDP tunnel module dependencies + * + * Tunnel drivers are expected to have a hard dependency on the udp_tunnel + * module. NIC drivers are not, they just attach their + * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come. + * Loading a tunnel driver will cause the udp_tunnel module to be loaded + * and only then will all the required state structures be allocated. + * Since we want a weak dependency from the drivers and the core to udp_tunnel + * we call things through the following stubs. + */ +struct udp_tunnel_nic_ops { + void (*get_port)(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti); + void (*set_port_priv)(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv); + void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti); + void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti); + void (*reset_ntf)(struct net_device *dev); +}; + +#ifdef CONFIG_INET +extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; +#else +#define udp_tunnel_nic_ops ((struct udp_tunnel_nic_ops *)NULL) +#endif + +static inline void +udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti) +{ + /* This helper is used from .sync_table, we indicate empty entries + * by zero'ed @ti. Drivers which need to know the details of a port + * when it gets deleted should use the .set_port / .unset_port + * callbacks. + * Zero out here, otherwise !CONFIG_INET causes uninitilized warnings. + */ + memset(ti, 0, sizeof(*ti)); + + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->get_port(dev, table, idx, ti); +} + +static inline void +udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); +} + +static inline void +udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->add_port(dev, ti); +} + +static inline void +udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->del_port(dev, ti); +} + +/** + * udp_tunnel_nic_reset_ntf() - device-originating reset notification + * @dev: network interface device structure + * + * Called by the driver to inform the core that the entire UDP tunnel port + * state has been lost, usually due to device reset. Core will assume device + * forgot all the ports and issue .set_port and .sync_table callbacks as + * necessary. + * + * This function must be called with rtnl lock held, and will issue all + * the callbacks before returning. + */ +static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->reset_ntf(dev); +} #endif diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9e1a186a3671..5b77a46885b9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -14,7 +14,7 @@ obj-y := route.o inetpeer.o protocol.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ - metrics.o netlink.o nexthop.o + metrics.o netlink.o nexthop.o udp_tunnel_stub.o obj-$(CONFIG_BPFILTER) += bpfilter/ @@ -29,6 +29,7 @@ gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o +udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o obj-$(CONFIG_NET_IPVTI) += ip_vti.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c deleted file mode 100644 index 3eecba0874aa..000000000000 --- a/net/ipv4/udp_tunnel.c +++ /dev/null @@ -1,224 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) -{ - int err; - struct socket *sock = NULL; - struct sockaddr_in udp_addr; - - err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; - - if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); - if (err < 0) - goto error; - } - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr)); - if (err < 0) - goto error; - - if (cfg->peer_udp_port) { - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto error; - } - - sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; - - *sockp = sock; - return 0; - -error: - if (sock) { - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); - } - *sockp = NULL; - return err; -} -EXPORT_SYMBOL(udp_sock_create4); - -void setup_udp_tunnel_sock(struct net *net, struct socket *sock, - struct udp_tunnel_sock_cfg *cfg) -{ - struct sock *sk = sock->sk; - - /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; - - /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ - inet_inc_convert_csum(sk); - - rcu_assign_sk_user_data(sk, cfg->sk_user_data); - - udp_sk(sk)->encap_type = cfg->encap_type; - udp_sk(sk)->encap_rcv = cfg->encap_rcv; - udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; - udp_sk(sk)->encap_destroy = cfg->encap_destroy; - udp_sk(sk)->gro_receive = cfg->gro_receive; - udp_sk(sk)->gro_complete = cfg->gro_complete; - - udp_tunnel_encap_enable(sock); -} -EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); - -void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, - unsigned short type) -{ - struct sock *sk = sock->sk; - struct udp_tunnel_info ti; - - if (!dev->netdev_ops->ndo_udp_tunnel_add || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - - ti.type = type; - ti.sa_family = sk->sk_family; - ti.port = inet_sk(sk)->inet_sport; - - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); -} -EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); - -void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, - unsigned short type) -{ - struct sock *sk = sock->sk; - struct udp_tunnel_info ti; - - if (!dev->netdev_ops->ndo_udp_tunnel_del || - !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - - ti.type = type; - ti.sa_family = sk->sk_family; - ti.port = inet_sk(sk)->inet_sport; - - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); -} -EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); - -/* Notify netdevs that UDP port started listening */ -void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) -{ - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct udp_tunnel_info ti; - struct net_device *dev; - - ti.type = type; - ti.sa_family = sk->sk_family; - ti.port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_add) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); - -/* Notify netdevs that UDP port is no more listening */ -void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) -{ - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct udp_tunnel_info ti; - struct net_device *dev; - - ti.type = type; - ti.sa_family = sk->sk_family; - ti.port = inet_sk(sk)->inet_sport; - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - if (!dev->netdev_ops->ndo_udp_tunnel_del) - continue; - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - continue; - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); - -void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) -{ - struct udphdr *uh; - - __skb_push(skb, sizeof(*uh)); - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - uh->dest = dst_port; - uh->source = src_port; - uh->len = htons(skb->len); - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - udp_set_csum(nocheck, skb, src, dst, skb->len); - - iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); -} -EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); - -void udp_tunnel_sock_release(struct socket *sock) -{ - rcu_assign_sk_user_data(sock->sk, NULL); - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); -} -EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); - -struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, - __be16 flags, __be64 tunnel_id, int md_size) -{ - struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; - - if (family == AF_INET) - tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); - else - tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); - if (!tun_dst) - return NULL; - - info = &tun_dst->u.tun_info; - info->key.tp_src = udp_hdr(skb)->source; - info->key.tp_dst = udp_hdr(skb)->dest; - if (udp_hdr(skb)->check) - info->key.tun_flags |= TUNNEL_CSUM; - return tun_dst; -} -EXPORT_SYMBOL_GPL(udp_tun_rx_dst); - -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c new file mode 100644 index 000000000000..3eecba0874aa --- /dev/null +++ b/net/ipv4/udp_tunnel_core.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) +{ + int err; + struct socket *sock = NULL; + struct sockaddr_in udp_addr; + + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); + if (err < 0) + goto error; + + if (cfg->bind_ifindex) { + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); + if (err < 0) + goto error; + } + + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->local_ip; + udp_addr.sin_port = cfg->local_udp_port; + err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr)); + if (err < 0) + goto error; + + if (cfg->peer_udp_port) { + udp_addr.sin_family = AF_INET; + udp_addr.sin_addr = cfg->peer_ip; + udp_addr.sin_port = cfg->peer_udp_port; + err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + sizeof(udp_addr), 0); + if (err < 0) + goto error; + } + + sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; + + *sockp = sock; + return 0; + +error: + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sock_release(sock); + } + *sockp = NULL; + return err; +} +EXPORT_SYMBOL(udp_sock_create4); + +void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *cfg) +{ + struct sock *sk = sock->sk; + + /* Disable multicast loopback */ + inet_sk(sk)->mc_loop = 0; + + /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ + inet_inc_convert_csum(sk); + + rcu_assign_sk_user_data(sk, cfg->sk_user_data); + + udp_sk(sk)->encap_type = cfg->encap_type; + udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; + udp_sk(sk)->encap_destroy = cfg->encap_destroy; + udp_sk(sk)->gro_receive = cfg->gro_receive; + udp_sk(sk)->gro_complete = cfg->gro_complete; + + udp_tunnel_encap_enable(sock); +} +EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); + +void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_add || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); + +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_del || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); + +/* Notify netdevs that UDP port started listening */ +void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_add) + continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; + dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); + +/* Notify netdevs that UDP port is no more listening */ +void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct udp_tunnel_info ti; + struct net_device *dev; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (!dev->netdev_ops->ndo_udp_tunnel_del) + continue; + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + continue; + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); + +void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, + __be16 df, __be16 src_port, __be16 dst_port, + bool xnet, bool nocheck) +{ + struct udphdr *uh; + + __skb_push(skb, sizeof(*uh)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = dst_port; + uh->source = src_port; + uh->len = htons(skb->len); + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + udp_set_csum(nocheck, skb, src, dst, skb->len); + + iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); +} +EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); + +void udp_tunnel_sock_release(struct socket *sock) +{ + rcu_assign_sk_user_data(sock->sk, NULL); + kernel_sock_shutdown(sock, SHUT_RDWR); + sock_release(sock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); + +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + if (family == AF_INET) + tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); + else + tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; + return tun_dst; +} +EXPORT_SYMBOL_GPL(udp_tun_rx_dst); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c new file mode 100644 index 000000000000..056cfe0b770e --- /dev/null +++ b/net/ipv4/udp_tunnel_nic.c @@ -0,0 +1,821 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include +#include +#include +#include +#include + +enum udp_tunnel_nic_table_entry_flags { + UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), + UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), + UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), + UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), +}; + +struct udp_tunnel_nic_table_entry { + __be16 port; + u8 type; + u8 use_cnt; + u8 flags; + u8 hw_priv; +}; + +/** + * struct udp_tunnel_nic - UDP tunnel port offload state + * @work: async work for talking to hardware from process context + * @dev: netdev pointer + * @need_sync: at least one port start changed + * @need_replay: space was freed, we need a replay of all ports + * @work_pending: @work is currently scheduled + * @n_tables: number of tables under @entries + * @missed: bitmap of tables which overflown + * @entries: table of tables of ports currently offloaded + */ +struct udp_tunnel_nic { + struct work_struct work; + + struct net_device *dev; + + u8 need_sync:1; + u8 need_replay:1; + u8 work_pending:1; + + unsigned int n_tables; + unsigned long missed; + struct udp_tunnel_nic_table_entry **entries; +}; + +/* We ensure all work structs are done using driver state, but not the code. + * We need a workqueue we can flush before module gets removed. + */ +static struct workqueue_struct *udp_tunnel_nic_workqueue; + +static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) +{ + switch (type) { + case UDP_TUNNEL_TYPE_VXLAN: + return "vxlan"; + case UDP_TUNNEL_TYPE_GENEVE: + return "geneve"; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + return "vxlan-gpe"; + default: + return "unknown"; + } +} + +static bool +udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->use_cnt == 0 && !entry->flags; +} + +static bool +udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) +{ + if (!udp_tunnel_nic_entry_is_free(entry)) + entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static void +udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) +{ + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; +} + +static bool +udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) +{ + return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | + UDP_TUNNEL_NIC_ENTRY_DEL); +} + +static void +udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, + struct udp_tunnel_nic_table_entry *entry, + unsigned int flag) +{ + entry->flags |= flag; + utn->need_sync = 1; +} + +static void +udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, + struct udp_tunnel_info *ti) +{ + memset(ti, 0, sizeof(*ti)); + ti->port = entry->port; + ti->type = entry->type; + ti->hw_priv = entry->hw_priv; +} + +static bool +udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return false; + return true; +} + +static bool +udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + if (!utn->missed) + return false; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!test_bit(i, &utn->missed)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) + return true; + } + + return false; +} + +static void +__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, + unsigned int idx, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + entry = &utn->entries[table][idx]; + + if (entry->use_cnt) + udp_tunnel_nic_ti_from_entry(entry, ti); +} + +static void +__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, + unsigned int idx, u8 priv) +{ + dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; +} + +static void +udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, + int err) +{ + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + + WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && + (!err || (err == -EEXIST && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; + + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && + (!err || (err == -ENOENT && dodgy))) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; + + if (!err) + entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + else + entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; +} + +static void +udp_tunnel_nic_device_sync_one(struct net_device *dev, + struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx) +{ + struct udp_tunnel_nic_table_entry *entry; + struct udp_tunnel_info ti; + int err; + + entry = &utn->entries[table][idx]; + if (!udp_tunnel_nic_entry_is_queued(entry)) + return; + + udp_tunnel_nic_ti_from_entry(entry, &ti); + if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) + err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); + else + err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, + &ti); + udp_tunnel_nic_entry_update_done(entry, err); + + if (err) + netdev_warn(dev, + "UDP tunnel port sync failed port %d type %s: %d\n", + be16_to_cpu(entry->port), + udp_tunnel_nic_tunnel_type_name(entry->type), + err); +} + +static void +udp_tunnel_nic_device_sync_by_port(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_device_sync_one(dev, utn, i, j); +} + +static void +udp_tunnel_nic_device_sync_by_table(struct net_device *dev, + struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + int err; + + for (i = 0; i < utn->n_tables; i++) { + /* Find something that needs sync in this table */ + for (j = 0; j < info->tables[i].n_entries; j++) + if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) + break; + if (j == info->tables[i].n_entries) + continue; + + err = info->sync_table(dev, i); + if (err) + netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", + i, err); + + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (udp_tunnel_nic_entry_is_queued(entry)) + udp_tunnel_nic_entry_update_done(entry, err); + } + } +} + +static void +__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + if (!utn->need_sync) + return; + + if (dev->udp_tunnel_nic_info->sync_table) + udp_tunnel_nic_device_sync_by_table(dev, utn); + else + udp_tunnel_nic_device_sync_by_port(dev, utn); + + utn->need_sync = 0; + /* Can't replay directly here, in case we come from the tunnel driver's + * notification - trying to replay may deadlock inside tunnel driver. + */ + utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); +} + +static void +udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + bool may_sleep; + + if (!utn->need_sync) + return; + + /* Drivers which sleep in the callback need to update from + * the workqueue, if we come from the tunnel driver's notification. + */ + may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; + if (!may_sleep) + __udp_tunnel_nic_device_sync(dev, utn); + if (may_sleep || utn->need_replay) { + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; + } +} + +static bool +udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, + struct udp_tunnel_info *ti) +{ + return table->tunnel_types & ti->type; +} + +static bool +udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i; + + /* Special case IPv4-only NICs */ + if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && + ti->sa_family != AF_INET) + return false; + + for (i = 0; i < utn->n_tables; i++) + if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) + return true; + return false; +} + +static int +udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic_table_entry *entry; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + entry = &utn->entries[i][j]; + + if (!udp_tunnel_nic_entry_is_free(entry) && + entry->port == ti->port && + entry->type != ti->type) { + __set_bit(i, &utn->missed); + return true; + } + } + return false; +} + +static void +udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; + unsigned int from, to; + + /* If not going from used to unused or vice versa - all done. + * For dodgy entries make sure we try to sync again (queue the entry). + */ + entry->use_cnt += use_cnt_adj; + if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) + return; + + /* Cancel the op before it was sent to the device, if possible, + * otherwise we'd need to take special care to issue commands + * in the same order the ports arrived. + */ + if (use_cnt_adj < 0) { + from = UDP_TUNNEL_NIC_ENTRY_ADD; + to = UDP_TUNNEL_NIC_ENTRY_DEL; + } else { + from = UDP_TUNNEL_NIC_ENTRY_DEL; + to = UDP_TUNNEL_NIC_ENTRY_ADD; + } + + if (entry->flags & from) { + entry->flags &= ~from; + if (!dodgy) + return; + } + + udp_tunnel_nic_entry_queue(utn, entry, to); +} + +static bool +udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, + unsigned int table, unsigned int idx, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; + + if (udp_tunnel_nic_entry_is_free(entry) || + entry->port != ti->port || + entry->type != ti->type) + return false; + + if (udp_tunnel_nic_entry_is_frozen(entry)) + return true; + + udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); + return true; +} + +/* Try to find existing matching entry and adjust its use count, instead of + * adding a new one. Returns true if entry was found. In case of delete the + * entry may have gotten removed in the process, in which case it will be + * queued for removal. + */ +static bool +udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti, int use_cnt_adj) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) + if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, + use_cnt_adj)) + return true; + } + + return false; +} + +static bool +udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, +1); +} + +static bool +udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + return udp_tunnel_nic_try_existing(dev, utn, ti, -1); +} + +static bool +udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, + struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_table_info *table; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) { + table = &dev->udp_tunnel_nic_info->tables[i]; + if (!udp_tunnel_nic_table_is_capable(table, ti)) + continue; + + for (j = 0; j < table->n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + if (!udp_tunnel_nic_entry_is_free(entry)) + continue; + + entry->port = ti->port; + entry->type = ti->type; + entry->use_cnt = 1; + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + return true; + } + + /* The different table may still fit this port in, but there + * are no devices currently which have multiple tables accepting + * the same tunnel type, and false positives are okay. + */ + __set_bit(i, &utn->missed); + } + + return false; +} + +static void +__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) + return; + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + /* It may happen that a tunnel of one type is removed and different + * tunnel type tries to reuse its port before the device was informed. + * Rely on utn->missed to re-add this port later. + */ + if (udp_tunnel_nic_has_collision(dev, utn, ti)) + return; + + if (!udp_tunnel_nic_add_existing(dev, utn, ti)) + udp_tunnel_nic_add_new(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void +__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + if (!udp_tunnel_nic_is_capable(dev, utn, ti)) + return; + + udp_tunnel_nic_del_existing(dev, utn, ti); + + udp_tunnel_nic_device_sync(dev, utn); +} + +static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int i, j; + + ASSERT_RTNL(); + + utn = dev->udp_tunnel_nic; + if (!utn) + return; + + utn->need_sync = false; + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + struct udp_tunnel_nic_table_entry *entry; + + entry = &utn->entries[i][j]; + + entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | + UDP_TUNNEL_NIC_ENTRY_OP_FAIL); + /* We don't release rtnl across ops */ + WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); + if (!entry->use_cnt) + continue; + + udp_tunnel_nic_entry_queue(utn, entry, + UDP_TUNNEL_NIC_ENTRY_ADD); + } + + __udp_tunnel_nic_device_sync(dev, utn); +} + +static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { + .get_port = __udp_tunnel_nic_get_port, + .set_port_priv = __udp_tunnel_nic_set_port_priv, + .add_port = __udp_tunnel_nic_add_port, + .del_port = __udp_tunnel_nic_del_port, + .reset_ntf = __udp_tunnel_nic_reset_ntf, +}; + +static void +udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) { + int adj_cnt = -utn->entries[i][j].use_cnt; + + if (adj_cnt) + udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); + } + + __udp_tunnel_nic_device_sync(dev, utn); + + for (i = 0; i < utn->n_tables; i++) + memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, + sizeof(**utn->entries))); + WARN_ON(utn->need_sync); + utn->need_replay = 0; +} + +static void +udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + unsigned int i, j; + + /* Freeze all the ports we are already tracking so that the replay + * does not double up the refcount. + */ + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); + utn->missed = 0; + utn->need_replay = 0; + + udp_tunnel_get_rx_info(dev); + + for (i = 0; i < utn->n_tables; i++) + for (j = 0; j < info->tables[i].n_entries; j++) + udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); +} + +static void udp_tunnel_nic_device_sync_work(struct work_struct *work) +{ + struct udp_tunnel_nic *utn = + container_of(work, struct udp_tunnel_nic, work); + + rtnl_lock(); + utn->work_pending = 0; + __udp_tunnel_nic_device_sync(utn->dev, utn); + + if (utn->need_replay) + udp_tunnel_nic_replay(utn->dev, utn); + rtnl_unlock(); +} + +static struct udp_tunnel_nic * +udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, + unsigned int n_tables) +{ + struct udp_tunnel_nic *utn; + unsigned int i; + + utn = kzalloc(sizeof(*utn), GFP_KERNEL); + if (!utn) + return NULL; + utn->n_tables = n_tables; + INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + + utn->entries = kmalloc_array(n_tables, sizeof(void *), GFP_KERNEL); + if (!utn->entries) + goto err_free_utn; + + for (i = 0; i < n_tables; i++) { + utn->entries[i] = kcalloc(info->tables[i].n_entries, + sizeof(*utn->entries[i]), GFP_KERNEL); + if (!utn->entries[i]) + goto err_free_prev_entries; + } + + return utn; + +err_free_prev_entries: + while (i--) + kfree(utn->entries[i]); + kfree(utn->entries); +err_free_utn: + kfree(utn); + return NULL; +} + +static int udp_tunnel_nic_register(struct net_device *dev) +{ + const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + struct udp_tunnel_nic *utn; + unsigned int n_tables, i; + + BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < + UDP_TUNNEL_NIC_MAX_TABLES); + + if (WARN_ON(!info->set_port != !info->unset_port) || + WARN_ON(!info->set_port == !info->sync_table) || + WARN_ON(!info->tables[0].n_entries)) + return -EINVAL; + + n_tables = 1; + for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { + if (!info->tables[i].n_entries) + continue; + + n_tables++; + if (WARN_ON(!info->tables[i - 1].n_entries)) + return -EINVAL; + } + + utn = udp_tunnel_nic_alloc(info, n_tables); + if (!utn) + return -ENOMEM; + + utn->dev = dev; + dev_hold(dev); + dev->udp_tunnel_nic = utn; + + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + udp_tunnel_get_rx_info(dev); + + return 0; +} + +static void +udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) +{ + unsigned int i; + + /* Flush before we check work, so we don't waste time adding entries + * from the work which we will boot immediately. + */ + udp_tunnel_nic_flush(dev, utn); + + /* Wait for the work to be done using the state, netdev core will + * retry unregister until we give up our reference on this device. + */ + if (utn->work_pending) + return; + + for (i = 0; i < utn->n_tables; i++) + kfree(utn->entries[i]); + kfree(utn->entries); + kfree(utn); + dev->udp_tunnel_nic = NULL; + dev_put(dev); +} + +static int +udp_tunnel_nic_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + const struct udp_tunnel_nic_info *info; + struct udp_tunnel_nic *utn; + + info = dev->udp_tunnel_nic_info; + if (!info) + return NOTIFY_DONE; + + if (event == NETDEV_REGISTER) { + int err; + + err = udp_tunnel_nic_register(dev); + if (err) + netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + return notifier_from_errno(err); + } + /* All other events will need the udp_tunnel_nic state */ + utn = dev->udp_tunnel_nic; + if (!utn) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + udp_tunnel_nic_unregister(dev, utn); + return NOTIFY_OK; + } + + /* All other events only matter if NIC has to be programmed open */ + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + return NOTIFY_DONE; + + if (event == NETDEV_UP) { + WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); + udp_tunnel_get_rx_info(dev); + return NOTIFY_OK; + } + if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_flush(dev, utn); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { + .notifier_call = udp_tunnel_nic_netdevice_event, +}; + +static int __init udp_tunnel_nic_init_module(void) +{ + int err; + + udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0); + if (!udp_tunnel_nic_workqueue) + return -ENOMEM; + + rtnl_lock(); + udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; + rtnl_unlock(); + + err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); + if (err) + goto err_unset_ops; + + return 0; + +err_unset_ops: + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + destroy_workqueue(udp_tunnel_nic_workqueue); + return err; +} +late_initcall(udp_tunnel_nic_init_module); + +static void __exit udp_tunnel_nic_cleanup_module(void) +{ + unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); + + rtnl_lock(); + udp_tunnel_nic_ops = NULL; + rtnl_unlock(); + + destroy_workqueue(udp_tunnel_nic_workqueue); +} +module_exit(udp_tunnel_nic_cleanup_module); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c new file mode 100644 index 000000000000..c4b2888f5fef --- /dev/null +++ b/net/ipv4/udp_tunnel_stub.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (c) 2020 Facebook Inc. + +#include + +const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops; +EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops); -- cgit v1.2.3 From 7f0a838254bdd9114b978ef2541a6ce330307e9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:55 -0700 Subject: bpf, xdp: Maintain info on attached XDP BPF programs in net_device Instead of delegating to drivers, maintain information about which BPF programs are attached in which XDP modes (generic/skb, driver, or hardware) locally in net_device. This effectively obsoletes XDP_QUERY_PROG command. Such re-organization simplifies existing code already. But it also allows to further add bpf_link-based XDP attachments without drivers having to know about any of this at all, which seems like a good setup. XDP_SETUP_PROG/XDP_SETUP_PROG_HW are just low-level commands to driver to install/uninstall active BPF program. All the higher-level concerns about prog/link interaction will be contained within generic driver-agnostic logic. All the XDP_QUERY_PROG calls to driver in dev_xdp_uninstall() were removed. It's not clear for me why dev_xdp_uninstall() were passing previous prog_flags when resetting installed programs. That seems unnecessary, plus most drivers don't populate prog_flags anyways. Having XDP_SETUP_PROG vs XDP_SETUP_PROG_HW should be enough of an indicator of what is required of driver to correctly reset active BPF program. dev_xdp_uninstall() is also generalized as an iteration over all three supported mode. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-3-andriin@fb.com --- include/linux/netdevice.h | 17 ++++- net/core/dev.c | 158 ++++++++++++++++++++++++++-------------------- net/core/rtnetlink.c | 5 +- 3 files changed, 105 insertions(+), 75 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..cad44b40c776 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -889,6 +889,17 @@ struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +enum bpf_xdp_mode { + XDP_MODE_SKB = 0, + XDP_MODE_DRV = 1, + XDP_MODE_HW = 2, + __MAX_XDP_MODE +}; + +struct bpf_xdp_entity { + struct bpf_prog *prog; +}; + struct netdev_bpf { enum bpf_netdev_command command; union { @@ -2142,6 +2153,9 @@ struct net_device { #endif const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + + /* protected by rtnl_lock */ + struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -3817,8 +3831,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - enum bpf_netdev_command cmd); +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index fe2e387eed29..bf38fde667e9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,84 +8716,103 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - enum bpf_netdev_command cmd) +static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { - struct netdev_bpf xdp; + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + return XDP_MODE_SKB; +} - if (!bpf_op) - return 0; +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) +{ + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + }; +} - memset(&xdp, 0, sizeof(xdp)); - xdp.command = cmd; +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].prog; +} + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); - /* Query must always succeed. */ - WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); + return prog ? prog->aux->id : 0; +} - return xdp.prog_id; +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].prog = prog; } -static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, - struct netlink_ext_ack *extack, u32 flags, - struct bpf_prog *prog) +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) { - bool non_hw = !(flags & XDP_FLAGS_HW_MODE); - struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; int err; - if (non_hw) { - prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, - XDP_QUERY_PROG)); - if (IS_ERR(prev_prog)) - prev_prog = NULL; - } - memset(&xdp, 0, sizeof(xdp)); - if (flags & XDP_FLAGS_HW_MODE) - xdp.command = XDP_SETUP_PROG_HW; - else - xdp.command = XDP_SETUP_PROG; + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. + */ + if (prog) + bpf_prog_inc(prog); err = bpf_op(dev, &xdp); - if (!err && non_hw) - bpf_prog_change_xdp(prev_prog, prog); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } - if (prev_prog) - bpf_prog_put(prev_prog); + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); - return err; + return 0; } static void dev_xdp_uninstall(struct net_device *dev) { - struct netdev_bpf xdp; - bpf_op_t ndo_bpf; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - /* Remove generic XDP */ - WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + ASSERT_RTNL(); - /* Remove from the driver */ - ndo_bpf = dev->netdev_ops->ndo_bpf; - if (!ndo_bpf) - return; + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - WARN_ON(ndo_bpf(dev, &xdp)); - if (xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) + continue; - /* Remove HW offload */ - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG_HW; - if (!ndo_bpf(dev, &xdp) && xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + bpf_prog_put(prog); + dev_xdp_set_prog(dev, mode, NULL); + } } /** @@ -8810,29 +8829,22 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; - enum bpf_netdev_command query; + enum bpf_xdp_mode mode = dev_xdp_mode(flags); + bool offload = mode == XDP_MODE_HW; u32 prog_id, expected_id = 0; - bpf_op_t bpf_op, bpf_chk; struct bpf_prog *prog; - bool offload; + bpf_op_t bpf_op; int err; ASSERT_RTNL(); - offload = flags & XDP_FLAGS_HW_MODE; - query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; - - bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } - if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) - bpf_op = generic_xdp_install; - if (bpf_op == bpf_chk) - bpf_chk = generic_xdp_install; - prog_id = __dev_xdp_query(dev, bpf_op, query); + prog_id = dev_xdp_prog_id(dev, mode); if (flags & XDP_FLAGS_REPLACE) { if (expected_fd >= 0) { prog = bpf_prog_get_type_dev(expected_fd, @@ -8850,8 +8862,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } if (fd >= 0) { - if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { - NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; + + if (!offload && dev_xdp_prog_id(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } @@ -8866,7 +8881,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); if (!offload && bpf_prog_is_dev_bound(prog->aux)) { - NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); bpf_prog_put(prog); return -EINVAL; } @@ -8895,11 +8910,14 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, prog = NULL; } - err = dev_xdp_install(dev, bpf_op, extack, flags, prog); - if (err < 0 && prog) + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, prog); + if (err < 0 && prog) { bpf_prog_put(prog); + return err; + } + dev_xdp_set_prog(dev, mode, prog); - return err; + return 0; } /** diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..58c484a28395 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1416,13 +1416,12 @@ static u32 rtnl_xdp_prog_skb(struct net_device *dev) static u32 rtnl_xdp_prog_drv(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); + return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, - XDP_QUERY_PROG_HW); + return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach BPF XDP program through BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment, previous BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once BPF link is attached, it can't be replaced by other BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when net_device is shutdown, similarly to how other BPF links behave (cgroup, flow_dissector). At that point bpf_link will become defunct, but won't be destroyed until last FD is closed. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 ++ include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) -- cgit v1.2.3 From e8407fdeb9a6866784e249881f6c786a0835faba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:46:02 -0700 Subject: bpf, xdp: Remove XDP_QUERY_PROG and XDP_QUERY_PROG_HW XDP commands Now that BPF program/link management is centralized in generic net_device code, kernel code never queries program id from drivers, so XDP_QUERY_PROG/XDP_QUERY_PROG_HW commands are unnecessary. This patch removes all the implementations of those commands in kernel, along the xdp_attachment_query(). This patch was compile-tested on allyesconfig. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-10-andriin@fb.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 ------ drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 4 ---- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 3 --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 5 ----- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 --- drivers/net/ethernet/intel/ice/ice_main.c | 3 --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ---- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 ------ drivers/net/ethernet/marvell/mvneta.c | 5 ----- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 3 --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 24 ---------------------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 18 ---------------- .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 ---- drivers/net/ethernet/qlogic/qede/qede_filter.c | 3 --- drivers/net/ethernet/sfc/efx.c | 5 ----- drivers/net/ethernet/socionext/netsec.c | 3 --- drivers/net/ethernet/ti/cpsw_priv.c | 3 --- drivers/net/hyperv/netvsc_bpf.c | 21 +------------------ drivers/net/netdevsim/bpf.c | 4 ---- drivers/net/netdevsim/netdevsim.h | 2 +- drivers/net/tun.c | 15 -------------- drivers/net/veth.c | 15 -------------- drivers/net/virtio_net.c | 17 --------------- drivers/net/xen-netfront.c | 21 ------------------- include/linux/netdevice.h | 8 -------- include/net/xdp.h | 2 -- net/core/dev.c | 4 ---- net/core/xdp.c | 9 -------- 28 files changed, 2 insertions(+), 218 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 6478c1e0d137..2a6c9725e092 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -576,15 +576,9 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) */ static int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) { - struct ena_adapter *adapter = netdev_priv(netdev); - switch (bpf->command) { case XDP_SETUP_PROG: return ena_xdp_set(netdev, bpf); - case XDP_QUERY_PROG: - bpf->prog_id = adapter->xdp_bpf_prog ? - adapter->xdp_bpf_prog->aux->id : 0; - break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 5e3b4a3b69ea..2704a4709bc7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -330,10 +330,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: rc = bnxt_xdp_set(bp, xdp->prog); break; - case XDP_QUERY_PROG: - xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; - rc = 0; - break; default: rc = -EINVAL; break; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 2ba0ce115e63..1c6163934e20 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1906,9 +1906,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return nicvf_xdp_setup(nic, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = nic->xdp_prog ? nic->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 9b4028c0e34c..17f6bcafc944 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2077,14 +2077,9 @@ out_err: static int dpaa2_eth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct dpaa2_eth_priv *priv = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return setup_xdp(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0; - break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dadbfb3d2a2b..d8315811cbdf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12923,9 +12923,6 @@ static int i40e_xdp(struct net_device *dev, switch (xdp->command) { case XDP_SETUP_PROG: return i40e_xdp_setup(vsi, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return i40e_xsk_umem_setup(vsi, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 16a4096bb780..231f4b6e93d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2292,9 +2292,6 @@ static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return ice_xsk_umem_setup(vsi, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4d898ff21a46..6f32b1706ab9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10190,10 +10190,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return ixgbe_xdp_setup(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = adapter->xdp_prog ? - adapter->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return ixgbe_xsk_umem_setup(adapter, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 6e9a397db583..a6267569bfa9 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4502,15 +4502,9 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct ixgbevf_adapter *adapter = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return ixgbevf_xdp_setup(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = adapter->xdp_prog ? - adapter->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 2c9277e73cef..6e3f9e2f883b 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4442,14 +4442,9 @@ static int mvneta_xdp_setup(struct net_device *dev, struct bpf_prog *prog, static int mvneta_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct mvneta_port *pp = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return mvneta_xdp_setup(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = pp->xdp_prog ? pp->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 6a3f356640a0..cd5e9d60307e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4656,9 +4656,6 @@ static int mvpp2_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return mvpp2_xdp_setup(port, xdp); - case XDP_QUERY_PROG: - xdp->prog_id = port->xdp_prog ? port->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 2b8608f8f0a9..106513f772c3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2802,35 +2802,11 @@ unlock_out: return err; } -static u32 mlx4_xdp_query(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; - const struct bpf_prog *xdp_prog; - u32 prog_id = 0; - - if (!priv->tx_ring_num[TX_XDP]) - return prog_id; - - mutex_lock(&mdev->state_lock); - xdp_prog = rcu_dereference_protected( - priv->rx_ring[0]->xdp_prog, - lockdep_is_held(&mdev->state_lock)); - if (xdp_prog) - prog_id = xdp_prog->aux->id; - mutex_unlock(&mdev->state_lock); - - return prog_id; -} - static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return mlx4_xdp_set(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = mlx4_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9d5d8b28bcd8..aa4fb503dac3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4518,29 +4518,11 @@ unlock: return err; } -static u32 mlx5e_xdp_query(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - u32 prog_id = 0; - - mutex_lock(&priv->state_lock); - xdp_prog = priv->channels.params.xdp_prog; - if (xdp_prog) - prog_id = xdp_prog->aux->id; - mutex_unlock(&priv->state_lock); - - return prog_id; -} - static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return mlx5e_xdp_set(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = mlx5e_xdp_query(dev); - return 0; case XDP_SETUP_XSK_UMEM: return mlx5e_xsk_setup_umem(dev, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 44608873d3d9..39ee23e8c0bf 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3614,10 +3614,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) return nfp_net_xdp_setup_drv(nn, xdp); case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup_hw(nn, xdp); - case XDP_QUERY_PROG: - return xdp_attachment_query(&nn->xdp, xdp); - case XDP_QUERY_PROG_HW: - return xdp_attachment_query(&nn->xdp_hw, xdp); default: return nfp_app_bpf(nn->app, nn, xdp); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index b7d0b6ccebd3..f961f65d9372 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1045,9 +1045,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return qede_xdp_set(edev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index f16b4f236031..d60acaa3879d 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -653,15 +653,10 @@ static int efx_xdp_setup_prog(struct efx_nic *efx, struct bpf_prog *prog) static int efx_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct efx_nic *efx = netdev_priv(dev); - struct bpf_prog *xdp_prog; switch (xdp->command) { case XDP_SETUP_PROG: return efx_xdp_setup_prog(efx, xdp->prog); - case XDP_QUERY_PROG: - xdp_prog = rtnl_dereference(efx->xdp_prog); - xdp->prog_id = xdp_prog ? xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 0f366cc50b74..25db667fa879 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1811,9 +1811,6 @@ static int netsec_xdp(struct net_device *ndev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return netsec_xdp_setup(priv, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index a399f3659346..d6d7a7d9c7ad 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1286,9 +1286,6 @@ int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf) case XDP_SETUP_PROG: return cpsw_xdp_prog_setup(priv, bpf); - case XDP_QUERY_PROG: - return xdp_attachment_query(&priv->xdpi, bpf); - default: return -EINVAL; } diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 8e4141552423..440486d9c999 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -163,16 +163,6 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) return ret; } -static u32 netvsc_xdp_query(struct netvsc_device *nvdev) -{ - struct bpf_prog *prog = netvsc_xdp_get(nvdev); - - if (prog) - return prog->aux->id; - - return 0; -} - int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct net_device_context *ndevctx = netdev_priv(dev); @@ -182,12 +172,7 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) int ret; if (!nvdev || nvdev->destroy) { - if (bpf->command == XDP_QUERY_PROG) { - bpf->prog_id = 0; - return 0; /* Query must always succeed */ - } else { - return -ENODEV; - } + return -ENODEV; } switch (bpf->command) { @@ -208,10 +193,6 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) return ret; - case XDP_QUERY_PROG: - bpf->prog_id = netvsc_xdp_query(nvdev); - return 0; - default: return -EINVAL; } diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 0b362b8dac17..2e90512f3bbe 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -551,10 +551,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) ASSERT_RTNL(); switch (bpf->command) { - case XDP_QUERY_PROG: - return xdp_attachment_query(&ns->xdp, bpf); - case XDP_QUERY_PROG_HW: - return xdp_attachment_query(&ns->xdp_hw, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index d164052e0393..284f7092241d 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -121,7 +121,7 @@ static inline void nsim_bpf_uninit(struct netdevsim *ns) static inline int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { - return bpf->command == XDP_QUERY_PROG ? 0 : -EOPNOTSUPP; + return -EOPNOTSUPP; } static inline int nsim_bpf_disable_tc(struct netdevsim *ns) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7adeb91bd368..061bebe25cb1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1184,26 +1184,11 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, return 0; } -static u32 tun_xdp_query(struct net_device *dev) -{ - struct tun_struct *tun = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = rtnl_dereference(tun->xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - - return 0; -} - static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = tun_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/veth.c b/drivers/net/veth.c index b594f03eeddb..e56cd562a664 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1198,26 +1198,11 @@ err: return err; } -static u32 veth_xdp_query(struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = priv->_xdp_prog; - if (xdp_prog) - return xdp_prog->aux->id; - - return 0; -} - static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = veth_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba38765dc490..6fa8fe5ef160 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2490,28 +2490,11 @@ err: return err; } -static u32 virtnet_xdp_query(struct net_device *dev) -{ - struct virtnet_info *vi = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - int i; - - for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - } - return 0; -} - static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = virtnet_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a63e550c370e..458be6882b98 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1480,32 +1480,11 @@ static int xennet_xdp_set(struct net_device *dev, struct bpf_prog *prog, return 0; } -static u32 xennet_xdp_query(struct net_device *dev) -{ - unsigned int num_queues = dev->real_num_tx_queues; - struct netfront_info *np = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - struct netfront_queue *queue; - unsigned int i; - - for (i = 0; i < num_queues; ++i) { - queue = &np->queues[i]; - xdp_prog = rtnl_dereference(queue->xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - } - - return 0; -} - static int xennet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return xennet_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = xennet_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d3c412fcfe5..1046763cd0dc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -876,8 +876,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, - XDP_QUERY_PROG, - XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, @@ -911,12 +909,6 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ - struct { - u32 prog_id; - /* flags with which program was installed */ - u32 prog_flags; - }; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; diff --git a/include/net/xdp.h b/include/net/xdp.h index dbe9c60797e1..3814fb631d52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -240,8 +240,6 @@ struct xdp_attachment_info { }; struct netdev_bpf; -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf); bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf); void xdp_attachment_setup(struct xdp_attachment_info *info, diff --git a/net/core/dev.c b/net/core/dev.c index 82ce0920b172..a2a57988880a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5468,10 +5468,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } break; - case XDP_QUERY_PROG: - xdp->prog_id = old ? old->aux->id : 0; - break; - default: ret = -EINVAL; break; diff --git a/net/core/xdp.c b/net/core/xdp.c index 3c45f99e26d5..48aba933a5a8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -400,15 +400,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - bpf->prog_id = info->prog ? info->prog->aux->id : 0; - bpf->prog_flags = info->prog ? info->flags : 0; - return 0; -} -EXPORT_SYMBOL_GPL(xdp_attachment_query); - bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { -- cgit v1.2.3 From 829eb208e80d6db95c0201cb8fa00c2f9ad87faf Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 31 Jul 2020 17:34:01 -0700 Subject: rtnetlink: add support for protodown reason netdev protodown is a mechanism that allows protocols to hold an interface down. It was initially introduced in the kernel to hold links down by a multihoming protocol. There was also an attempt to introduce protodown reason at the time but was rejected. protodown and protodown reason is supported by almost every switching and routing platform. It was ok for a while to live without a protodown reason. But, its become more critical now given more than one protocol may need to keep a link down on a system at the same time. eg: vrrp peer node, port security, multihoming protocol. Its common for Network operators and protocol developers to look for such a reason on a networking box (Its also known as errDisable by most networking operators) This patch adds support for link protodown reason attribute. There are two ways to maintain protodown reasons. (a) enumerate every possible reason code in kernel - A protocol developer has to make a request and have that appear in a certain kernel version (b) provide the bits in the kernel, and allow user-space (sysadmin or NOS distributions) to manage the bit-to-reasonname map. - This makes extending reason codes easier (kind of like the iproute2 table to vrf-name map /etc/iproute2/rt_tables.d/) This patch takes approach (b). a few things about the patch: - It treats the protodown reason bits as counter to indicate active protodown users - Since protodown attribute is already an exposed UAPI, the reason is not enforced on a protodown set. Its a no-op if not used. the patch follows the below algorithm: - presence of reason bits set indicates protodown is in use - user can set protodown and protodown reason in a single or multiple setlink operations - setlink operation to clear protodown, will return -EBUSY if there are active protodown reason bits - reason is not included in link dumps if not used example with patched iproute2: $cat /etc/iproute2/protodown_reasons.d/r.conf 0 mlag 1 evpn 2 vrrp 3 psecurity $ip link set dev vxlan0 protodown on protodown_reason vrrp on $ip link set dev vxlan0 protodown_reason mlag on $ip link show 14: vxlan0: mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether f6:06:be:17:91:e7 brd ff:ff:ff:ff:ff:ff protodown on $ip link set dev vxlan0 protodown_reason mlag off $ip link set dev vxlan0 protodown off protodown_reason vrrp off Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++ include/uapi/linux/if_link.h | 10 ++++ net/core/dev.c | 25 ++++++++++ net/core/rtnetlink.c | 113 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 147 insertions(+), 5 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..ba0fa6b22787 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2058,6 +2058,8 @@ struct net_device { struct timer_list watchdog_timer; int watchdog_timeo; + u32 proto_down_reason; + struct list_head todo_list; int __percpu *pcpu_refcnt; @@ -3810,6 +3812,8 @@ int dev_get_port_parent_id(struct net_device *dev, bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b); int dev_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down_generic(struct net_device *dev, bool proto_down); +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 63af64646358..7fba4de511de 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -170,12 +170,22 @@ enum { IFLA_PROP_LIST, IFLA_ALT_IFNAME, /* Alternative ifname */ IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, __IFLA_MAX }; #define IFLA_MAX (__IFLA_MAX - 1) +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + /* backwards compatibility for userspace */ #ifndef __KERNEL__ #define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) diff --git a/net/core/dev.c b/net/core/dev.c index 38a6371d9bc5..f7ef0f5c5569 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8715,6 +8715,31 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +/** + * dev_change_proto_down_reason - proto down reason + * + * @dev: device + * @mask: proto down mask + * @value: proto down value + */ +void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, + u32 value) +{ + int b; + + if (!mask) { + dev->proto_down_reason = value; + } else { + for_each_set_bit(b, &mask, 32) { + if (value & (1 << b)) + dev->proto_down_reason |= BIT(b); + else + dev->proto_down_reason &= ~BIT(b); + } + } +} +EXPORT_SYMBOL(dev_change_proto_down_reason); + u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..a54c3e0f2ee1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1000,6 +1000,16 @@ static size_t rtnl_prop_list_size(const struct net_device *dev) return size; } +static size_t rtnl_proto_down_size(const struct net_device *dev) +{ + size_t size = nla_total_size(1); + + if (dev->proto_down_reason) + size += nla_total_size(0) + nla_total_size(4); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1041,7 +1051,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ - + nla_total_size(1) /* IFLA_PROTO_DOWN */ + + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ @@ -1658,6 +1668,35 @@ nest_cancel: return ret; } +static int rtnl_fill_proto_down(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *pr; + u32 preason; + + if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + goto nla_put_failure; + + preason = dev->proto_down_reason; + if (!preason) + return 0; + + pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); + if (!pr) + return -EMSGSIZE; + + if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { + nla_nest_cancel(skb, pr); + goto nla_put_failure; + } + + nla_nest_end(skb, pr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1708,13 +1747,15 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; + if (rtnl_fill_proto_down(skb, dev)) + goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; @@ -1834,6 +1875,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, + [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2483,6 +2525,67 @@ static int do_set_master(struct net_device *dev, int ifindex, return 0; } +static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { + [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, + [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, +}; + +static int do_set_proto_down(struct net_device *dev, + struct nlattr *nl_proto_down, + struct nlattr *nl_proto_down_reason, + struct netlink_ext_ack *extack) +{ + struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1]; + const struct net_device_ops *ops = dev->netdev_ops; + unsigned long mask = 0; + u32 value; + bool proto_down; + int err; + + if (!ops->ndo_change_proto_down) { + NL_SET_ERR_MSG(extack, "Protodown not supported by device"); + return -EOPNOTSUPP; + } + + if (nl_proto_down_reason) { + err = nla_parse_nested_deprecated(pdreason, + IFLA_PROTO_DOWN_REASON_MAX, + nl_proto_down_reason, + ifla_proto_down_reason_policy, + NULL); + if (err < 0) + return err; + + if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) { + NL_SET_ERR_MSG(extack, "Invalid protodown reason value"); + return -EINVAL; + } + + value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]); + + if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) + mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); + + dev_change_proto_down_reason(dev, mask, value); + } + + if (nl_proto_down) { + proto_down = nla_get_u8(nl_proto_down); + + /* Dont turn off protodown if there are active reasons */ + if (!proto_down && dev->proto_down_reason) { + NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); + return -EBUSY; + } + err = dev_change_proto_down(dev, + proto_down); + if (err) + return err; + } + + return 0; +} + #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 @@ -2771,9 +2874,9 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; - if (tb[IFLA_PROTO_DOWN]) { - err = dev_change_proto_down(dev, - nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { + err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], + tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; -- cgit v1.2.3 From df23bb18b44b9a1f2b54358201730e710a9df57f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Tue, 4 Aug 2020 07:53:42 +0200 Subject: ipv4: route: Ignore output interface in FIB lookup for PMTU route Currently, processes sending traffic to a local bridge with an encapsulation device as a port don't get ICMP errors if they exceed the PMTU of the encapsulated link. David Ahern suggested this as a hack, but it actually looks like the correct solution: when we update the PMTU for a given destination by means of updating or creating a route exception, the encapsulation might trigger this because of PMTU discovery happening either on the encapsulation device itself, or its lower layer. This happens on bridged encapsulations only. The output interface shouldn't matter, because we already have a valid destination. Drop the output interface restriction from the associated route lookup. For UDP tunnels, we will now have a route exception created for the encapsulation itself, with a MTU value reflecting its headroom, which allows a bridge forwarding IP packets originated locally to deliver errors back to the sending socket. The behaviour is now consistent with IPv6 and verified with selftests pmtu_ipv{4,6}_br_{geneve,vxlan}{4,6}_exception introduced later in this series. v2: - reset output interface only for bridge ports (David Ahern) - add and use netif_is_any_bridge_port() helper (David Ahern) Suggested-by: David Ahern Signed-off-by: Stefano Brivio Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ net/ipv4/route.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 88d40b9abaa1..90444622b703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4840,6 +4840,11 @@ static inline bool netif_is_ovs_port(const struct net_device *dev) return dev->priv_flags & IFF_OVS_DATAPATH; } +static inline bool netif_is_any_bridge_port(const struct net_device *dev) +{ + return netif_is_bridge_port(dev) || netif_is_ovs_port(dev); +} + static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a01efa062f6b..8ca6bcab7b03 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1050,6 +1050,11 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); + + /* Don't make lookup fail for bridged encapsulations */ + if (skb && netif_is_any_bridge_port(skb->dev)) + fl4.flowi4_oif = 0; + __ip_rt_update_pmtu(rt, &fl4, mtu); } -- cgit v1.2.3 From eb02d39ad3099c3dcd96c5b3fdc0303541766ed1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:31:16 -0700 Subject: netdevice.h: fix proto_down_reason kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'proto_down_reason' not described in 'net_device' Fixes: 829eb208e80d ("rtnetlink: add support for protodown reason") Signed-off-by: Randy Dunlap Acked-by: Roopa Prabhu Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..bf0e313486be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1784,6 +1784,7 @@ enum netdev_priv_flags { * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * + * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one -- cgit v1.2.3 From ffa59b0b396cb2c0ea6712ff30ee05c35d4c9c8d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 6 Sep 2020 20:32:30 -0700 Subject: netdevice.h: fix xdp_state kernel-doc warning Fix kernel-doc warning in : ../include/linux/netdevice.h:2158: warning: Function parameter or member 'xdp_state' not described in 'net_device' Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Randy Dunlap Cc: Andrii Nakryiko Cc: Alexei Starovoitov Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bf0e313486be..7bd4fcdd0738 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1849,6 +1849,7 @@ enum netdev_priv_flags { * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state + * @xdp_state: stores info on attached XDP BPF programs * * FIXME: cleanup struct net_device such that network protocol info * moves out. -- cgit v1.2.3 From eff7423365a6938d2d34dbce989febed2ae1f957 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 25 Sep 2020 18:13:12 +0000 Subject: net: core: introduce struct netdev_nested_priv for nested interface infrastructure Functions related to nested interface infrastructure such as netdev_walk_all_{ upper | lower }_dev() pass both private functions and "data" pointer to handle their own things. At this point, the data pointer type is void *. In order to make it easier to expand common variables and functions, this new netdev_nested_priv structure is added. In the following patch, a new member variable will be added into this struct to fix the lockdep issue. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/infiniband/core/cache.c | 10 ++-- drivers/infiniband/core/cma.c | 9 ++-- drivers/infiniband/core/roce_gid_mgmt.c | 9 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 9 ++-- drivers/net/bonding/bond_alb.c | 9 ++-- drivers/net/bonding/bond_main.c | 10 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 37 ++++++++++---- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 24 +++++---- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 11 ++-- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 10 ++-- drivers/net/ethernet/rocker/rocker_main.c | 9 ++-- drivers/net/wireless/quantenna/qtnfmac/core.c | 10 ++-- include/linux/netdevice.h | 16 +++--- net/bridge/br_arp_nd_proxy.c | 26 +++++++--- net/bridge/br_vlan.c | 20 +++++--- net/core/dev.c | 59 ++++++++++++++-------- 16 files changed, 183 insertions(+), 95 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index ffad73bb40ff..5a76611e684a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1320,9 +1320,10 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); -static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) +static int get_lower_dev_vlan(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - u16 *vlan_id = data; + u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); @@ -1348,6 +1349,9 @@ static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { + struct netdev_nested_priv priv = { + .data = (void *)vlan_id, + }; struct net_device *ndev; rcu_read_lock(); @@ -1368,7 +1372,7 @@ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, * the lower vlan device for this gid entry. */ netdev_walk_all_lower_dev_rcu(attr->ndev, - get_lower_dev_vlan, vlan_id); + get_lower_dev_vlan, &priv); } } rcu_read_unlock(); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 7f0e91e92968..5888311b2119 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2865,9 +2865,10 @@ struct iboe_prio_tc_map { bool found; }; -static int get_lower_vlan_dev_tc(struct net_device *dev, void *data) +static int get_lower_vlan_dev_tc(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct iboe_prio_tc_map *map = data; + struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); @@ -2886,16 +2887,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); + struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; + priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, - &prio_tc_map); + &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 2860def84f4d..6b8364bb032d 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -531,10 +531,11 @@ struct upper_list { struct net_device *upper; }; -static int netdev_upper_walk(struct net_device *upper, void *data) +static int netdev_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - struct list_head *upper_list = data; + struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; @@ -553,12 +554,14 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, struct net_device *ndev)) { struct net_device *ndev = cookie; + struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); + priv.data = &upper_list; rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &upper_list); + netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ab75b7f745d4..f772fe8c5b66 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -342,9 +342,10 @@ struct ipoib_walk_data { struct net_device *result; }; -static int ipoib_upper_walk(struct net_device *upper, void *_data) +static int ipoib_upper_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - struct ipoib_walk_data *data = _data; + struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { @@ -368,10 +369,12 @@ static int ipoib_upper_walk(struct net_device *upper, void *_data) static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { + struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; + priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); @@ -379,7 +382,7 @@ static struct net_device *ipoib_get_net_dev_match_addr( goto out; } - netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &data); + netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 4e1b7deb724b..c3091e00dd5f 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -942,9 +942,10 @@ struct alb_walk_data { bool strict_match; }; -static int alb_upper_dev_walk(struct net_device *upper, void *_data) +static int alb_upper_dev_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - struct alb_walk_data *data = _data; + struct alb_walk_data *data = (struct alb_walk_data *)priv->data; bool strict_match = data->strict_match; struct bonding *bond = data->bond; struct slave *slave = data->slave; @@ -983,6 +984,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], bool strict_match) { struct bonding *bond = bond_get_bond_by_slave(slave); + struct netdev_nested_priv priv; struct alb_walk_data data = { .strict_match = strict_match, .mac_addr = mac_addr, @@ -990,6 +992,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], .bond = bond, }; + priv.data = (void *)&data; /* send untagged */ alb_send_lp_vid(slave, mac_addr, 0, 0); @@ -997,7 +1000,7 @@ static void alb_send_learning_packets(struct slave *slave, u8 mac_addr[], * for that device. */ rcu_read_lock(); - netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &data); + netdev_walk_all_upper_dev_rcu(bond->dev, alb_upper_dev_walk, &priv); rcu_read_unlock(); } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 14740d3053b8..84ecbc6fa0ff 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2511,22 +2511,26 @@ re_arm: } } -static int bond_upper_dev_walk(struct net_device *upper, void *data) +static int bond_upper_dev_walk(struct net_device *upper, + struct netdev_nested_priv *priv) { - __be32 ip = *((__be32 *)data); + __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { + struct netdev_nested_priv priv = { + .data = (void *)&ip, + }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); - if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &ip)) + if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2f8a4cfc5fa1..86ca8b9ea1b8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -5396,9 +5396,10 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter, return err; } -static int ixgbe_macvlan_up(struct net_device *vdev, void *data) +static int ixgbe_macvlan_up(struct net_device *vdev, + struct netdev_nested_priv *priv) { - struct ixgbe_adapter *adapter = data; + struct ixgbe_adapter *adapter = (struct ixgbe_adapter *)priv->data; struct ixgbe_fwd_adapter *accel; if (!netif_is_macvlan(vdev)) @@ -5415,8 +5416,12 @@ static int ixgbe_macvlan_up(struct net_device *vdev, void *data) static void ixgbe_configure_dfwd(struct ixgbe_adapter *adapter) { + struct netdev_nested_priv priv = { + .data = (void *)adapter, + }; + netdev_walk_all_upper_dev_rcu(adapter->netdev, - ixgbe_macvlan_up, adapter); + ixgbe_macvlan_up, &priv); } static void ixgbe_configure(struct ixgbe_adapter *adapter) @@ -9023,9 +9028,10 @@ static void ixgbe_set_prio_tc_map(struct ixgbe_adapter *adapter) } #endif /* CONFIG_IXGBE_DCB */ -static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, void *data) +static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, + struct netdev_nested_priv *priv) { - struct ixgbe_adapter *adapter = data; + struct ixgbe_adapter *adapter = (struct ixgbe_adapter *)priv->data; struct ixgbe_fwd_adapter *accel; int pool; @@ -9062,13 +9068,16 @@ static int ixgbe_reassign_macvlan_pool(struct net_device *vdev, void *data) static void ixgbe_defrag_macvlan_pools(struct net_device *dev) { struct ixgbe_adapter *adapter = netdev_priv(dev); + struct netdev_nested_priv priv = { + .data = (void *)adapter, + }; /* flush any stale bits out of the fwd bitmask */ bitmap_clear(adapter->fwd_bitmask, 1, 63); /* walk through upper devices reassigning pools */ netdev_walk_all_upper_dev_rcu(dev, ixgbe_reassign_macvlan_pool, - adapter); + &priv); } /** @@ -9242,14 +9251,18 @@ struct upper_walk_data { u8 queue; }; -static int get_macvlan_queue(struct net_device *upper, void *_data) +static int get_macvlan_queue(struct net_device *upper, + struct netdev_nested_priv *priv) { if (netif_is_macvlan(upper)) { struct ixgbe_fwd_adapter *vadapter = macvlan_accel_priv(upper); - struct upper_walk_data *data = _data; - struct ixgbe_adapter *adapter = data->adapter; - int ifindex = data->ifindex; + struct ixgbe_adapter *adapter; + struct upper_walk_data *data; + int ifindex; + data = (struct upper_walk_data *)priv->data; + ifindex = data->ifindex; + adapter = data->adapter; if (vadapter && upper->ifindex == ifindex) { data->queue = adapter->rx_ring[vadapter->rx_base_queue]->reg_idx; data->action = data->queue; @@ -9265,6 +9278,7 @@ static int handle_redirect_action(struct ixgbe_adapter *adapter, int ifindex, { struct ixgbe_ring_feature *vmdq = &adapter->ring_feature[RING_F_VMDQ]; unsigned int num_vfs = adapter->num_vfs, vf; + struct netdev_nested_priv priv; struct upper_walk_data data; struct net_device *upper; @@ -9284,8 +9298,9 @@ static int handle_redirect_action(struct ixgbe_adapter *adapter, int ifindex, data.ifindex = ifindex; data.action = 0; data.queue = 0; + priv.data = (void *)&data; if (netdev_walk_all_upper_dev_rcu(adapter->netdev, - get_macvlan_queue, &data)) { + get_macvlan_queue, &priv)) { *action = data.action; *queue = data.queue; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 4186e29119c2..f3c0e241e1b4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3690,13 +3690,13 @@ bool mlxsw_sp_port_dev_check(const struct net_device *dev) return dev->netdev_ops == &mlxsw_sp_port_netdev_ops; } -static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data) +static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - struct mlxsw_sp_port **p_mlxsw_sp_port = data; int ret = 0; if (mlxsw_sp_port_dev_check(lower_dev)) { - *p_mlxsw_sp_port = netdev_priv(lower_dev); + priv->data = (void *)netdev_priv(lower_dev); ret = 1; } @@ -3705,15 +3705,16 @@ static int mlxsw_sp_lower_dev_walk(struct net_device *lower_dev, void *data) struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find(struct net_device *dev) { - struct mlxsw_sp_port *mlxsw_sp_port; + struct netdev_nested_priv priv = { + .data = NULL, + }; if (mlxsw_sp_port_dev_check(dev)) return netdev_priv(dev); - mlxsw_sp_port = NULL; - netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &mlxsw_sp_port); + netdev_walk_all_lower_dev(dev, mlxsw_sp_lower_dev_walk, &priv); - return mlxsw_sp_port; + return (struct mlxsw_sp_port *)priv.data; } struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev) @@ -3726,16 +3727,17 @@ struct mlxsw_sp *mlxsw_sp_lower_get(struct net_device *dev) struct mlxsw_sp_port *mlxsw_sp_port_dev_lower_find_rcu(struct net_device *dev) { - struct mlxsw_sp_port *mlxsw_sp_port; + struct netdev_nested_priv priv = { + .data = NULL, + }; if (mlxsw_sp_port_dev_check(dev)) return netdev_priv(dev); - mlxsw_sp_port = NULL; netdev_walk_all_lower_dev_rcu(dev, mlxsw_sp_lower_dev_walk, - &mlxsw_sp_port); + &priv); - return mlxsw_sp_port; + return (struct mlxsw_sp_port *)priv.data; } struct mlxsw_sp_port *mlxsw_sp_port_lower_dev_hold(struct net_device *dev) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 24f1fd1f8d56..460cb523312f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -7351,9 +7351,10 @@ int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event, return err; } -static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, void *data) +static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct mlxsw_sp_rif *rif = data; + struct mlxsw_sp_rif *rif = (struct mlxsw_sp_rif *)priv->data; if (!netif_is_macvlan(dev)) return 0; @@ -7364,12 +7365,16 @@ static int __mlxsw_sp_rif_macvlan_flush(struct net_device *dev, void *data) static int mlxsw_sp_rif_macvlan_flush(struct mlxsw_sp_rif *rif) { + struct netdev_nested_priv priv = { + .data = (void *)rif, + }; + if (!netif_is_macvlan_port(rif->dev)) return 0; netdev_warn(rif->dev, "Router interface is deleted. Upper macvlans will not work\n"); return netdev_walk_all_upper_dev_rcu(rif->dev, - __mlxsw_sp_rif_macvlan_flush, rif); + __mlxsw_sp_rif_macvlan_flush, &priv); } static void mlxsw_sp_rif_subport_setup(struct mlxsw_sp_rif *rif, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 72912afa6f72..6501ce94ace5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -136,9 +136,9 @@ bool mlxsw_sp_bridge_device_is_offloaded(const struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev, - void *data) + struct netdev_nested_priv *priv) { - struct mlxsw_sp *mlxsw_sp = data; + struct mlxsw_sp *mlxsw_sp = priv->data; mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev); return 0; @@ -147,10 +147,14 @@ static int mlxsw_sp_bridge_device_upper_rif_destroy(struct net_device *dev, static void mlxsw_sp_bridge_device_rifs_destroy(struct mlxsw_sp *mlxsw_sp, struct net_device *dev) { + struct netdev_nested_priv priv = { + .data = (void *)mlxsw_sp, + }; + mlxsw_sp_rif_destroy_by_dev(mlxsw_sp, dev); netdev_walk_all_upper_dev_rcu(dev, mlxsw_sp_bridge_device_upper_rif_destroy, - mlxsw_sp); + &priv); } static int mlxsw_sp_bridge_device_vxlan_init(struct mlxsw_sp_bridge *bridge, diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 42458a46ffaf..9cc31f7e0df1 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -3099,9 +3099,10 @@ struct rocker_walk_data { struct rocker_port *port; }; -static int rocker_lower_dev_walk(struct net_device *lower_dev, void *_data) +static int rocker_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) { - struct rocker_walk_data *data = _data; + struct rocker_walk_data *data = (struct rocker_walk_data *)priv->data; int ret = 0; if (rocker_port_dev_check_under(lower_dev, data->rocker)) { @@ -3115,6 +3116,7 @@ static int rocker_lower_dev_walk(struct net_device *lower_dev, void *_data) struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev, struct rocker *rocker) { + struct netdev_nested_priv priv; struct rocker_walk_data data; if (rocker_port_dev_check_under(dev, rocker)) @@ -3122,7 +3124,8 @@ struct rocker_port *rocker_port_dev_lower_find(struct net_device *dev, data.rocker = rocker; data.port = NULL; - netdev_walk_all_lower_dev(dev, rocker_lower_dev_walk, &data); + priv.data = (void *)&data; + netdev_walk_all_lower_dev(dev, rocker_lower_dev_walk, &priv); return data.port; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c index 6aafff9d4231..e013ebe3079c 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/core.c +++ b/drivers/net/wireless/quantenna/qtnfmac/core.c @@ -671,9 +671,10 @@ bool qtnf_netdev_is_qtn(const struct net_device *ndev) return ndev->netdev_ops == &qtnf_netdev_ops; } -static int qtnf_check_br_ports(struct net_device *dev, void *data) +static int qtnf_check_br_ports(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct net_device *ndev = data; + struct net_device *ndev = (struct net_device *)priv->data; if (dev != ndev && netdev_port_same_parent_id(dev, ndev)) return -ENOTSUPP; @@ -686,6 +687,9 @@ static int qtnf_core_netdevice_event(struct notifier_block *nb, { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); const struct netdev_notifier_changeupper_info *info; + struct netdev_nested_priv priv = { + .data = (void *)ndev, + }; struct net_device *brdev; struct qtnf_vif *vif; struct qtnf_bus *bus; @@ -725,7 +729,7 @@ static int qtnf_core_netdevice_event(struct notifier_block *nb, } else { ret = netdev_walk_all_lower_dev(brdev, qtnf_check_br_ports, - ndev); + &priv); } break; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7bd4fcdd0738..313803d6c781 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4455,6 +4455,10 @@ extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; +struct netdev_nested_priv { + void *data; +}; + bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev); struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); @@ -4470,8 +4474,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *upper_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); @@ -4508,12 +4512,12 @@ struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *lower_dev, - void *data), - void *data); + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv); void *netdev_adjacent_get_private(struct list_head *adj_list); void *netdev_lower_get_first_private_rcu(struct net_device *dev); diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index b18cdf03edb3..dfec65eca8a6 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -88,9 +88,10 @@ static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, } } -static int br_chk_addr_ip(struct net_device *dev, void *data) +static int br_chk_addr_ip(struct net_device *dev, + struct netdev_nested_priv *priv) { - __be32 ip = *(__be32 *)data; + __be32 ip = *(__be32 *)priv->data; struct in_device *in_dev; __be32 addr = 0; @@ -107,11 +108,15 @@ static int br_chk_addr_ip(struct net_device *dev, void *data) static bool br_is_local_ip(struct net_device *dev, __be32 ip) { - if (br_chk_addr_ip(dev, &ip)) + struct netdev_nested_priv priv = { + .data = (void *)&ip, + }; + + if (br_chk_addr_ip(dev, &priv)) return true; /* check if ip is configured on upper dev */ - if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip)) + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &priv)) return true; return false; @@ -361,9 +366,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, } } -static int br_chk_addr_ip6(struct net_device *dev, void *data) +static int br_chk_addr_ip6(struct net_device *dev, + struct netdev_nested_priv *priv) { - struct in6_addr *addr = (struct in6_addr *)data; + struct in6_addr *addr = (struct in6_addr *)priv->data; if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) return 1; @@ -374,11 +380,15 @@ static int br_chk_addr_ip6(struct net_device *dev, void *data) static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) { - if (br_chk_addr_ip6(dev, addr)) + struct netdev_nested_priv priv = { + .data = (void *)addr, + }; + + if (br_chk_addr_ip6(dev, &priv)) return true; /* check if ip is configured on upper dev */ - if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr)) + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, &priv)) return true; return false; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 61c94cefa843..ee8780080be5 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1360,7 +1360,7 @@ static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) } static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, - __always_unused void *data) + __always_unused struct netdev_nested_priv *priv) { return br_vlan_is_bind_vlan_dev(dev); } @@ -1383,9 +1383,9 @@ struct br_vlan_bind_walk_data { }; static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, - void *data_in) + struct netdev_nested_priv *priv) { - struct br_vlan_bind_walk_data *data = data_in; + struct br_vlan_bind_walk_data *data = priv->data; int found = 0; if (br_vlan_is_bind_vlan_dev(dev) && @@ -1403,10 +1403,13 @@ br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) struct br_vlan_bind_walk_data data = { .vid = vid, }; + struct netdev_nested_priv priv = { + .data = (void *)&data, + }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, - &data); + &priv); rcu_read_unlock(); return data.result; @@ -1487,9 +1490,9 @@ struct br_vlan_link_state_walk_data { }; static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, - void *data_in) + struct netdev_nested_priv *priv) { - struct br_vlan_link_state_walk_data *data = data_in; + struct br_vlan_link_state_walk_data *data = priv->data; if (br_vlan_is_bind_vlan_dev(vlan_dev)) br_vlan_set_vlan_dev_state(data->br, vlan_dev); @@ -1503,10 +1506,13 @@ static void br_vlan_link_state_change(struct net_device *dev, struct br_vlan_link_state_walk_data data = { .br = br }; + struct netdev_nested_priv priv = { + .data = (void *)&data, + }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, - &data); + &priv); rcu_read_unlock(); } diff --git a/net/core/dev.c b/net/core/dev.c index b7b3d6e15cda..a4a1fa806c5c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6812,9 +6812,10 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, return NULL; } -static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) +static int ____netdev_has_upper_dev(struct net_device *upper_dev, + struct netdev_nested_priv *priv) { - struct net_device *dev = data; + struct net_device *dev = (struct net_device *)priv->data; return upper_dev == dev; } @@ -6831,10 +6832,14 @@ static int ____netdev_has_upper_dev(struct net_device *upper_dev, void *data) bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -6851,8 +6856,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev); bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); @@ -6997,8 +7006,8 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, static int __netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7010,7 +7019,7 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7046,8 +7055,8 @@ static int __netdev_walk_all_upper_dev(struct net_device *dev, int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7058,7 +7067,7 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7094,10 +7103,14 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_nested_priv priv = { + .data = (void *)upper_dev, + }; + ASSERT_RTNL(); return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev, - upper_dev); + &priv); } /** @@ -7215,8 +7228,8 @@ static struct net_device *__netdev_next_lower_dev(struct net_device *dev, int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7227,7 +7240,7 @@ int netdev_walk_all_lower_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7262,8 +7275,8 @@ EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static int __netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7275,7 +7288,7 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } @@ -7364,13 +7377,15 @@ static u8 __netdev_lower_depth(struct net_device *dev) return max_depth; } -static int __netdev_update_upper_level(struct net_device *dev, void *data) +static int __netdev_update_upper_level(struct net_device *dev, + struct netdev_nested_priv *__unused) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } -static int __netdev_update_lower_level(struct net_device *dev, void *data) +static int __netdev_update_lower_level(struct net_device *dev, + struct netdev_nested_priv *__unused) { dev->lower_level = __netdev_lower_depth(dev) + 1; return 0; @@ -7378,8 +7393,8 @@ static int __netdev_update_lower_level(struct net_device *dev, void *data) int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, - void *data), - void *data) + struct netdev_nested_priv *priv), + struct netdev_nested_priv *priv) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; @@ -7390,7 +7405,7 @@ int netdev_walk_all_lower_dev_rcu(struct net_device *dev, while (1) { if (now != dev) { - ret = fn(now, data); + ret = fn(now, priv); if (ret) return ret; } -- cgit v1.2.3 From 1fc70edb7d7b5ce1ae32b0cf90183f4879ad421a Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Fri, 25 Sep 2020 18:13:29 +0000 Subject: net: core: add nested_level variable in net_device This patch is to add a new variable 'nested_level' into the net_device structure. This variable will be used as a parameter of spin_lock_nested() of dev->addr_list_lock. netif_addr_lock() can be called recursively so spin_lock_nested() is used instead of spin_lock() and dev->lower_level is used as a parameter of spin_lock_nested(). But, dev->lower_level value can be updated while it is being used. So, lockdep would warn a possible deadlock scenario. When a stacked interface is deleted, netif_{uc | mc}_sync() is called recursively. So, spin_lock_nested() is called recursively too. At this moment, the dev->lower_level variable is used as a parameter of it. dev->lower_level value is updated when interfaces are being unlinked/linked immediately. Thus, After unlinking, dev->lower_level shouldn't be a parameter of spin_lock_nested(). A (macvlan) | B (vlan) | C (bridge) | D (macvlan) | E (vlan) | F (bridge) A->lower_level : 6 B->lower_level : 5 C->lower_level : 4 D->lower_level : 3 E->lower_level : 2 F->lower_level : 1 When an interface 'A' is removed, it releases resources. At this moment, netif_addr_lock() would be called. Then, netdev_upper_dev_unlink() is called recursively. Then dev->lower_level is updated. There is no problem. But, when the bridge module is removed, 'C' and 'F' interfaces are removed at once. If 'F' is removed first, a lower_level value is like below. A->lower_level : 5 B->lower_level : 4 C->lower_level : 3 D->lower_level : 2 E->lower_level : 1 F->lower_level : 1 Then, 'C' is removed. at this moment, netif_addr_lock() is called recursively. The ordering is like this. C(3)->D(2)->E(1)->F(1) At this moment, the lower_level value of 'E' and 'F' are the same. So, lockdep warns a possible deadlock scenario. In order to avoid this problem, a new variable 'nested_level' is added. This value is the same as dev->lower_level - 1. But this value is updated in rtnl_unlock(). So, this variable can be used as a parameter of spin_lock_nested() safely in the rtnl context. Test commands: ip link add br0 type bridge vlan_filtering 1 ip link add vlan1 link br0 type vlan id 10 ip link add macvlan2 link vlan1 type macvlan ip link add br3 type bridge vlan_filtering 1 ip link set macvlan2 master br3 ip link add vlan4 link br3 type vlan id 10 ip link add macvlan5 link vlan4 type macvlan ip link add br6 type bridge vlan_filtering 1 ip link set macvlan5 master br6 ip link add vlan7 link br6 type vlan id 10 ip link add macvlan8 link vlan7 type macvlan ip link set br0 up ip link set vlan1 up ip link set macvlan2 up ip link set br3 up ip link set vlan4 up ip link set macvlan5 up ip link set br6 up ip link set vlan7 up ip link set macvlan8 up modprobe -rv bridge Splat looks like: [ 36.057436][ T744] WARNING: possible recursive locking detected [ 36.058848][ T744] 5.9.0-rc6+ #728 Not tainted [ 36.059959][ T744] -------------------------------------------- [ 36.061391][ T744] ip/744 is trying to acquire lock: [ 36.062590][ T744] ffff8c4767509280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_set_rx_mode+0x19/0x30 [ 36.064922][ T744] [ 36.064922][ T744] but task is already holding lock: [ 36.066626][ T744] ffff8c4767769280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_uc_add+0x1e/0x60 [ 36.068851][ T744] [ 36.068851][ T744] other info that might help us debug this: [ 36.070731][ T744] Possible unsafe locking scenario: [ 36.070731][ T744] [ 36.072497][ T744] CPU0 [ 36.073238][ T744] ---- [ 36.074007][ T744] lock(&vlan_netdev_addr_lock_key); [ 36.075290][ T744] lock(&vlan_netdev_addr_lock_key); [ 36.076590][ T744] [ 36.076590][ T744] *** DEADLOCK *** [ 36.076590][ T744] [ 36.078515][ T744] May be due to missing lock nesting notation [ 36.078515][ T744] [ 36.080491][ T744] 3 locks held by ip/744: [ 36.081471][ T744] #0: ffffffff98571df0 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x236/0x490 [ 36.083614][ T744] #1: ffff8c4767769280 (&vlan_netdev_addr_lock_key){+...}-{2:2}, at: dev_uc_add+0x1e/0x60 [ 36.085942][ T744] #2: ffff8c476c8da280 (&bridge_netdev_addr_lock_key/4){+...}-{2:2}, at: dev_uc_sync+0x39/0x80 [ 36.088400][ T744] [ 36.088400][ T744] stack backtrace: [ 36.089772][ T744] CPU: 6 PID: 744 Comm: ip Not tainted 5.9.0-rc6+ #728 [ 36.091364][ T744] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 36.093630][ T744] Call Trace: [ 36.094416][ T744] dump_stack+0x77/0x9b [ 36.095385][ T744] __lock_acquire+0xbc3/0x1f40 [ 36.096522][ T744] lock_acquire+0xb4/0x3b0 [ 36.097540][ T744] ? dev_set_rx_mode+0x19/0x30 [ 36.098657][ T744] ? rtmsg_ifinfo+0x1f/0x30 [ 36.099711][ T744] ? __dev_notify_flags+0xa5/0xf0 [ 36.100874][ T744] ? rtnl_is_locked+0x11/0x20 [ 36.101967][ T744] ? __dev_set_promiscuity+0x7b/0x1a0 [ 36.103230][ T744] _raw_spin_lock_bh+0x38/0x70 [ 36.104348][ T744] ? dev_set_rx_mode+0x19/0x30 [ 36.105461][ T744] dev_set_rx_mode+0x19/0x30 [ 36.106532][ T744] dev_set_promiscuity+0x36/0x50 [ 36.107692][ T744] __dev_set_promiscuity+0x123/0x1a0 [ 36.108929][ T744] dev_set_promiscuity+0x1e/0x50 [ 36.110093][ T744] br_port_set_promisc+0x1f/0x40 [bridge] [ 36.111415][ T744] br_manage_promisc+0x8b/0xe0 [bridge] [ 36.112728][ T744] __dev_set_promiscuity+0x123/0x1a0 [ 36.113967][ T744] ? __hw_addr_sync_one+0x23/0x50 [ 36.115135][ T744] __dev_set_rx_mode+0x68/0x90 [ 36.116249][ T744] dev_uc_sync+0x70/0x80 [ 36.117244][ T744] dev_uc_add+0x50/0x60 [ 36.118223][ T744] macvlan_open+0x18e/0x1f0 [macvlan] [ 36.119470][ T744] __dev_open+0xd6/0x170 [ 36.120470][ T744] __dev_change_flags+0x181/0x1d0 [ 36.121644][ T744] dev_change_flags+0x23/0x60 [ 36.122741][ T744] do_setlink+0x30a/0x11e0 [ 36.123778][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.124929][ T744] ? __nla_validate_parse.part.6+0x45/0x8e0 [ 36.126309][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.127457][ T744] __rtnl_newlink+0x546/0x8e0 [ 36.128560][ T744] ? lock_acquire+0xb4/0x3b0 [ 36.129623][ T744] ? deactivate_slab.isra.85+0x6a1/0x850 [ 36.130946][ T744] ? __lock_acquire+0x92c/0x1f40 [ 36.132102][ T744] ? lock_acquire+0xb4/0x3b0 [ 36.133176][ T744] ? is_bpf_text_address+0x5/0xe0 [ 36.134364][ T744] ? rtnl_newlink+0x2e/0x70 [ 36.135445][ T744] ? rcu_read_lock_sched_held+0x32/0x60 [ 36.136771][ T744] ? kmem_cache_alloc_trace+0x2d8/0x380 [ 36.138070][ T744] ? rtnl_newlink+0x2e/0x70 [ 36.139164][ T744] rtnl_newlink+0x47/0x70 [ ... ] Fixes: 845e0ebb4408 ("net: change addr_list_lock back to static key") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 52 ++++++++++++++++++++++++----- net/core/dev.c | 85 +++++++++++++++++++++++++++++++++++++++-------- net/core/dev_addr_lists.c | 12 +++---- 3 files changed, 122 insertions(+), 27 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 313803d6c781..9fdb3ebef306 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1955,6 +1955,7 @@ struct net_device { unsigned short type; unsigned short hard_header_len; unsigned char min_header_len; + unsigned char name_assign_type; unsigned short needed_headroom; unsigned short needed_tailroom; @@ -1965,21 +1966,28 @@ struct net_device { unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; + unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; spinlock_t addr_list_lock; - unsigned char name_assign_type; - bool uc_promisc; + struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; #ifdef CONFIG_SYSFS struct kset *queues_kset; +#endif +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; #endif unsigned int promiscuity; unsigned int allmulti; + bool uc_promisc; +#ifdef CONFIG_LOCKDEP + unsigned char nested_level; +#endif /* Protocol-specific pointers */ @@ -4260,17 +4268,23 @@ static inline void netif_tx_disable(struct net_device *dev) static inline void netif_addr_lock(struct net_device *dev) { - spin_lock(&dev->addr_list_lock); -} + unsigned char nest_level = 0; -static inline void netif_addr_lock_nested(struct net_device *dev) -{ - spin_lock_nested(&dev->addr_list_lock, dev->lower_level); +#ifdef CONFIG_LOCKDEP + nest_level = dev->nested_level; +#endif + spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_lock_bh(struct net_device *dev) { - spin_lock_bh(&dev->addr_list_lock); + unsigned char nest_level = 0; + +#ifdef CONFIG_LOCKDEP + nest_level = dev->nested_level; +#endif + local_bh_disable(); + spin_lock_nested(&dev->addr_list_lock, nest_level); } static inline void netif_addr_unlock(struct net_device *dev) @@ -4455,7 +4469,19 @@ extern int dev_rx_weight; extern int dev_tx_weight; extern int gro_normal_batch; +enum { + NESTED_SYNC_IMM_BIT, + NESTED_SYNC_TODO_BIT, +}; + +#define __NESTED_SYNC_BIT(bit) ((u32)1 << (bit)) +#define __NESTED_SYNC(name) __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT) + +#define NESTED_SYNC_IMM __NESTED_SYNC(IMM) +#define NESTED_SYNC_TODO __NESTED_SYNC(TODO) + struct netdev_nested_priv { + unsigned char flags; void *data; }; @@ -4465,6 +4491,16 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter); +#ifdef CONFIG_LOCKDEP +static LIST_HEAD(net_unlink_list); + +static inline void net_unlink_todo(struct net_device *dev) +{ + if (list_empty(&dev->unlink_list)) + list_add_tail(&dev->unlink_list, &net_unlink_list); +} +#endif + /* iterate through upper list, must be called under RCU read lock */ #define netdev_for_each_upper_dev_rcu(dev, updev, iter) \ for (iter = &(dev)->adj_list.upper, \ diff --git a/net/core/dev.c b/net/core/dev.c index a4a1fa806c5c..4906b44af850 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7104,6 +7104,7 @@ static bool __netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { struct netdev_nested_priv priv = { + .flags = 0, .data = (void *)upper_dev, }; @@ -7385,9 +7386,19 @@ static int __netdev_update_upper_level(struct net_device *dev, } static int __netdev_update_lower_level(struct net_device *dev, - struct netdev_nested_priv *__unused) + struct netdev_nested_priv *priv) { dev->lower_level = __netdev_lower_depth(dev) + 1; + +#ifdef CONFIG_LOCKDEP + if (!priv) + return 0; + + if (priv->flags & NESTED_SYNC_IMM) + dev->nested_level = dev->lower_level - 1; + if (priv->flags & NESTED_SYNC_TODO) + net_unlink_todo(dev); +#endif return 0; } @@ -7665,6 +7676,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, + struct netdev_nested_priv *priv, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { @@ -7721,9 +7733,9 @@ static int __netdev_upper_dev_link(struct net_device *dev, __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); - __netdev_update_lower_level(upper_dev, NULL); + __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, - NULL); + priv); return 0; @@ -7748,8 +7760,13 @@ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + return __netdev_upper_dev_link(dev, upper_dev, false, - NULL, NULL, extack); + NULL, NULL, &priv, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -7772,13 +7789,19 @@ int netdev_master_upper_dev_link(struct net_device *dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info, extack); + upper_priv, upper_info, &priv, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); static void __netdev_upper_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netdev_nested_priv *priv) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { @@ -7803,9 +7826,9 @@ static void __netdev_upper_dev_unlink(struct net_device *dev, __netdev_update_upper_level(dev, NULL); __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); - __netdev_update_lower_level(upper_dev, NULL); + __netdev_update_lower_level(upper_dev, priv); __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, - NULL); + priv); } /** @@ -7819,7 +7842,12 @@ static void __netdev_upper_dev_unlink(struct net_device *dev, void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - __netdev_upper_dev_unlink(dev, upper_dev); + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_TODO, + .data = NULL, + }; + + __netdev_upper_dev_unlink(dev, upper_dev, &priv); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -7855,6 +7883,10 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev, struct net_device *dev, struct netlink_ext_ack *extack) { + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; int err; if (!new_dev) @@ -7862,8 +7894,8 @@ int netdev_adjacent_change_prepare(struct net_device *old_dev, if (old_dev && new_dev != old_dev) netdev_adjacent_dev_disable(dev, old_dev); - - err = netdev_upper_dev_link(new_dev, dev, extack); + err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv, + extack); if (err) { if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); @@ -7878,6 +7910,11 @@ void netdev_adjacent_change_commit(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { + struct netdev_nested_priv priv = { + .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO, + .data = NULL, + }; + if (!new_dev || !old_dev) return; @@ -7885,7 +7922,7 @@ void netdev_adjacent_change_commit(struct net_device *old_dev, return; netdev_adjacent_dev_enable(dev, old_dev); - netdev_upper_dev_unlink(old_dev, dev); + __netdev_upper_dev_unlink(old_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_commit); @@ -7893,13 +7930,18 @@ void netdev_adjacent_change_abort(struct net_device *old_dev, struct net_device *new_dev, struct net_device *dev) { + struct netdev_nested_priv priv = { + .flags = 0, + .data = NULL, + }; + if (!new_dev) return; if (old_dev && new_dev != old_dev) netdev_adjacent_dev_enable(dev, old_dev); - netdev_upper_dev_unlink(new_dev, dev); + __netdev_upper_dev_unlink(new_dev, dev, &priv); } EXPORT_SYMBOL(netdev_adjacent_change_abort); @@ -10083,6 +10125,19 @@ static void netdev_wait_allrefs(struct net_device *dev) void netdev_run_todo(void) { struct list_head list; +#ifdef CONFIG_LOCKDEP + struct list_head unlink_list; + + list_replace_init(&net_unlink_list, &unlink_list); + + while (!list_empty(&unlink_list)) { + struct net_device *dev = list_first_entry(&unlink_list, + struct net_device, + unlink_list); + list_del(&dev->unlink_list); + dev->nested_level = dev->lower_level - 1; + } +#endif /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); @@ -10295,6 +10350,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; +#ifdef CONFIG_LOCKDEP + dev->nested_level = 0; + INIT_LIST_HEAD(&dev->unlink_list); +#endif INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 54cd568e7c2f..fa1c37ec40c9 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -700,7 +700,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) * larger. */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -867,7 +867,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -897,7 +897,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_nested(to); + netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -922,7 +922,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); - netif_addr_lock_nested(to); + netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); -- cgit v1.2.3 From a93bdcb94a0b3ca72046151412c2389dca681d2a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 2 Oct 2020 07:49:45 +0200 Subject: net: core: document two new elements of struct net_device As warned by "make htmldocs", there are two new struct elements that aren't documented: ../include/linux/netdevice.h:2159: warning: Function parameter or member 'unlink_list' not described in 'net_device' ../include/linux/netdevice.h:2159: warning: Function parameter or member 'nested_level' not described in 'net_device' Fixes: 1fc70edb7d7b ("net: core: add nested_level variable in net_device") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9fdb3ebef306..18dec08439f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1851,6 +1851,11 @@ enum netdev_priv_flags { * @udp_tunnel_nic: UDP tunnel offload state * @xdp_state: stores info on attached XDP BPF programs * + * @nested_level: Used as as a parameter of spin_lock_nested() of + * dev->addr_list_lock. + * @unlink_list: As netif_addr_lock() can be called recursively, + * keep a list of interfaces to be deleted. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ -- cgit v1.2.3