From d445516966dcb2924741b13b27738b54df2af01a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:26:45 -0700 Subject: net: xdp: support xdp generic on virtual devices XDP generic allows users to test XDP programs and/or run them with degraded performance on devices that do not yet support XDP. For testing I typically test eBPF programs using a set of veth devices. This allows testing topologies that would otherwise be difficult to setup especially in the early stages of development. This patch adds a xdp generic hook to the netif_rx_internal() function which is called from dev_forward_skb(). With this addition attaching XDP programs to veth devices works as expected! Also I noticed multiple drivers using netif_rx(). These devices will also benefit and generic XDP will work for them as well. Signed-off-by: John Fastabend Tested-by: Andy Gospodarek Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 208 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 95 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 02440518dd69..a1ed7b41b3e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3865,6 +3865,107 @@ drop: return NET_RX_DROP; } +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + +static struct static_key generic_xdp_needed __read_mostly; + +static int do_xdp_generic(struct sk_buff *skb) +{ + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return XDP_DROP; + } + } + return XDP_PASS; +} + static int netif_rx_internal(struct sk_buff *skb) { int ret; @@ -3872,6 +3973,14 @@ static int netif_rx_internal(struct sk_buff *skb) net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); + + if (static_key_false(&generic_xdp_needed)) { + int ret = do_xdp_generic(skb); + + if (ret != XDP_PASS) + return NET_RX_DROP; + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4338,8 +4447,6 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } -static struct static_key generic_xdp_needed __read_mostly; - static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); @@ -4373,89 +4480,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) return ret; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct bpf_prog *xdp_prog) -{ - struct xdp_buff xdp; - u32 act = XDP_DROP; - void *orig_data; - int hlen, off; - u32 mac_len; - - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_cloned(skb)) - return XDP_PASS; - - if (skb_linearize(skb)) - goto do_drop; - - /* The XDP program wants to see the packet starting at the MAC - * header. - */ - mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data = xdp.data; - - act = bpf_prog_run_xdp(xdp_prog, &xdp); - - off = xdp.data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - - switch (act) { - case XDP_TX: - __skb_push(skb, mac_len); - /* fall through */ - case XDP_PASS: - break; - - default: - bpf_warn_invalid_xdp_action(act); - /* fall through */ - case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); - /* fall through */ - case XDP_DROP: - do_drop: - kfree_skb(skb); - break; - } - - return act; -} - -/* When doing generic XDP we have to bypass the qdisc layer and the - * network taps in order to match in-driver-XDP behavior. - */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) -{ - struct net_device *dev = skb->dev; - struct netdev_queue *txq; - bool free_skb = true; - int cpu, rc; - - txq = netdev_pick_tx(dev, skb, NULL); - cpu = smp_processor_id(); - HARD_TX_LOCK(dev, txq, cpu); - if (!netif_xmit_stopped(txq)) { - rc = netdev_start_xmit(skb, dev, txq, 0); - if (dev_xmit_complete(rc)) - free_skb = false; - } - HARD_TX_UNLOCK(dev, txq); - if (free_skb) { - trace_xdp_exception(dev, xdp_prog, XDP_TX); - kfree_skb(skb); - } -} - static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4468,17 +4492,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - - if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int ret = do_xdp_generic(skb); - if (act != XDP_PASS) { - rcu_read_unlock(); - if (act == XDP_TX) - generic_xdp_tx(skb, xdp_prog); - return NET_RX_DROP; - } + if (ret != XDP_PASS) { + rcu_read_unlock(); + return NET_RX_DROP; } } -- cgit v1.2.3 From 6103aa96ec077c976e851e0b89cc2446cb76573d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:50 -0700 Subject: net: implement XDP_REDIRECT for xdp generic Add support for redirect to xdp generic creating a fall back for devices that do not yet have support and allowing test infrastructure using veth pairs to be built. Signed-off-by: John Fastabend Tested-by: Andy Gospodarek Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++-- net/core/filter.c | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 64cae7a08148..10df7daf5ec6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,6 +712,7 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp); void bpf_warn_invalid_xdp_action(u32 act); diff --git a/net/core/dev.c b/net/core/dev.c index a1ed7b41b3e8..9f3f4083ada5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3902,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, -off); switch (act) { + case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); /* fall through */ @@ -3956,14 +3957,27 @@ static int do_xdp_generic(struct sk_buff *skb) if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int err; if (act != XDP_PASS) { - if (act == XDP_TX) + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb); + if (err) + goto out_redir; + /* fallthru to submit skb */ + case XDP_TX: generic_xdp_tx(skb, xdp_prog); + break; + } return XDP_DROP; } } return XDP_PASS; +out_redir: + trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); + kfree_skb(skb); + return XDP_DROP; } static int netif_rx_internal(struct sk_buff *skb) @@ -3977,8 +3991,12 @@ static int netif_rx_internal(struct sk_buff *skb) if (static_key_false(&generic_xdp_needed)) { int ret = do_xdp_generic(skb); + /* Consider XDP consuming the packet a success from + * the netdev point of view we do not want to count + * this as an error. + */ if (ret != XDP_PASS) - return NET_RX_DROP; + return NET_RX_SUCCESS; } #ifdef CONFIG_RPS diff --git a/net/core/filter.c b/net/core/filter.c index d606a66d1040..eeb713461c25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2437,6 +2437,32 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + unsigned int len; + + dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + goto err; + } + + if (unlikely(!(dev->flags & IFF_UP))) + goto err; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len > len) + goto err; + + skb->dev = dev; + return 0; +err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); + BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit v1.2.3 From d4c023f4f3dd96734ef53d4b588136a872300046 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 3 Jul 2017 07:04:22 -0700 Subject: net: Remove references to NETIF_F_UFO in netdev_fix_features(). It is going away. Signed-off-by: David S. Miller --- net/core/dev.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9f3f4083ada5..467420eda02e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7271,24 +7271,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_GSO; } - /* UFO needs SG and checksumming */ - if (features & NETIF_F_UFO) { - /* maybe split UFO into V4 and V6? */ - if (!(features & NETIF_F_HW_CSUM) && - ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != - (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no checksum offload features.\n"); - features &= ~NETIF_F_UFO; - } - - if (!(features & NETIF_F_SG)) { - netdev_dbg(dev, - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); - features &= ~NETIF_F_UFO; - } - } - /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { -- cgit v1.2.3 From 7051b88a35c7dde5705923833117e14f9cc17d92 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 18 Jul 2017 15:59:27 -0700 Subject: net: make dev_close and related functions void There is no useful return value from dev_close. All paths return 0. Change dev_close and helper functions to void. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c60351b84323..614642eb7eb7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2432,8 +2432,8 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev); -int dev_close(struct net_device *dev); -int dev_close_many(struct list_head *head, bool unlink); +void dev_close(struct net_device *dev); +void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 467420eda02e..d1b9c9b6c970 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1413,7 +1413,7 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; @@ -1455,23 +1455,18 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); list_add(&dev->close_list, &single); - retval = __dev_close_many(&single); + __dev_close_many(&single); list_del(&single); - - return retval; } -int dev_close_many(struct list_head *head, bool unlink) +void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1488,8 +1483,6 @@ int dev_close_many(struct list_head *head, bool unlink) if (unlink) list_del_init(&dev->close_list); } - - return 0; } EXPORT_SYMBOL(dev_close_many); @@ -1502,7 +1495,7 @@ EXPORT_SYMBOL(dev_close_many); * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ -int dev_close(struct net_device *dev) +void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1511,7 +1504,6 @@ int dev_close(struct net_device *dev) dev_close_many(&single, true); list_del(&single); } - return 0; } EXPORT_SYMBOL(dev_close); @@ -6725,8 +6717,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev); + } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; -- cgit v1.2.3 From d764a122cc7af7ab1c40c08745f0fcd33cc2f7db Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:28 +0200 Subject: net: add new netdevice feature for offload of RX port for UDP tunnels This adds a new netdevice feature, so that the offloading of RX port for UDP tunnels can be disabled by the administrator on some netdevices, using the "rx-udp_tunnel-port-offload" feature in ethtool. This feature is set for all devices that provide ndo_udp_tunnel_add. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/dev.c | 6 ++++++ net/core/ethtool.c | 1 + 3 files changed, 9 insertions(+) (limited to 'net/core/dev.c') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index ebd273627334..dc8b4896b77b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -75,6 +75,7 @@ enum { NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ + NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ /* * Add your fresh new feature above and remember to update @@ -138,6 +139,7 @@ enum { #define NETIF_F_HW_TC __NETIF_F(HW_TC) #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) +#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/dev.c b/net/core/dev.c index 509af6ce8831..9081134adc0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7530,6 +7530,12 @@ int register_netdevice(struct net_device *dev) */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78408ab77a10..b987bc475fc8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -105,6 +105,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_TC_BIT] = "hw-tc-offload", [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", }; static const char -- cgit v1.2.3 From ae847f40b6418a7d6e197f6ef0d85f40e313c4d4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:31 +0200 Subject: net: call udp_tunnel_get_rx_info when NETIF_F_RX_UDP_TUNNEL_PORT is toggled NETIF_F_RX_UDP_TUNNEL_PORT is special, in that we need to do more than just flip the bit in dev->features. When disabling we must also clear currently offloaded ports from the device, and when enabling we must tell the device to offload the ports it can. Because vxlan stores its sockets in a hashtable and they are inserted at the head of per-bucket lists, switching the feature off and then on can result in a different set of ports being offloaded (depending on the HW's limits). Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9081134adc0d..8ea6b4b42611 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -144,6 +144,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -7327,8 +7328,27 @@ sync_lower: netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); - if (!err) + if (!err) { + netdev_features_t diff = features ^ dev->features; + + if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { + /* udp_tunnel_{get,drop}_rx_info both need + * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the + * device, or they won't do anything. + * Thus we need to update dev->features + * *before* calling udp_tunnel_get_rx_info, + * but *after* calling udp_tunnel_drop_rx_info. + */ + if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { + dev->features = features; + udp_tunnel_get_rx_info(dev); + } else { + udp_tunnel_drop_rx_info(dev); + } + } + dev->features = features; + } return err < 0 ? 0 : 1; } -- cgit v1.2.3 From 1f8b977ab32dc5d148f103326e80d9097f1cefb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:41 -0400 Subject: sock: enable MSG_ZEROCOPY Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with skb_zerocopy_clone() wherever needed due to skb split, merge, resize or clone. Split skb_orphan_frags into two variants. The split, merge, .. paths support reference counted zerocopy buffers, so do not do a deep copy. Add skb_orphan_frags_rx for paths that may loop packets to receive sockets. That is not allowed, as it may cause unbounded latency. Deep copy all zerocopy copy buffers, ref-counted or not, in this path. The exact locations to modify were chosen by exhaustively searching through all code that might modify skb_frag references and/or the the SKBTX_DEV_ZEROCOPY tx_flags bit. The changes err on the safe side, in two ways. (1) legacy ubuf_info paths virtio and tap are not modified. They keep a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags still call skb_copy_ubufs and thus copy frags in this case. (2) not all copies deep in the stack are addressed yet. skb_shift, skb_split and skb_try_coalesce can be refined to avoid copying. These are not in the hot path and this patch is hairy enough as is, so that is left for future refinement. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- drivers/vhost/net.c | 1 + include/linux/skbuff.h | 14 +++++++++++++- net/core/dev.c | 4 ++-- net/core/skbuff.c | 48 +++++++++++++++++++----------------------------- 5 files changed, 36 insertions(+), 33 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 68add55f8460..d21510d47aa2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -892,7 +892,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; skb_tx_timestamp(skb); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06d044862e58..ba08b78ed630 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,6 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; + atomic_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59cff7aa494e..e5387932c266 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2512,7 +2512,17 @@ static inline void skb_orphan(struct sk_buff *skb) */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { - if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))) + if (likely(!skb_zcopy(skb))) + return 0; + if (skb_uarg(skb)->callback == sock_zerocopy_callback) + return 0; + return skb_copy_ubufs(skb, gfp_mask); +} + +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) +{ + if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -2944,6 +2954,8 @@ static inline int skb_add_data(struct sk_buff *skb, static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { + if (skb_zcopy(skb)) + return false; if (i) { const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; diff --git a/net/core/dev.c b/net/core/dev.c index 8ea6b4b42611..1d75499add72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -4412,7 +4412,7 @@ skip_classify: } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29e34bc6a17c..74d3c36f8419 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = shinfo->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } - if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); + skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; - } + skb_zcopy_clear(skb, true); } EXPORT_SYMBOL(skb_tx_error); @@ -1029,9 +1011,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); -/* unused only until next patch in the series; will remove attribute */ -static int __attribute__((unused)) - skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { @@ -1068,7 +1048,6 @@ static int __attribute__((unused)) */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, new_frags; @@ -1127,8 +1106,6 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); @@ -1137,7 +1114,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -1298,7 +1275,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1375,9 +1353,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + atomic_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -1872,6 +1851,9 @@ end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -2627,6 +2609,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_tx_error(from); return -ENOMEM; } + skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) @@ -2924,6 +2907,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2967,6 +2951,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_headlen(skb)) return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; todo = shiftlen; from = 0; @@ -3540,6 +3526,8 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -4663,6 +4651,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; if (skb_headlen(from) != 0) { struct page *page; -- cgit v1.2.3 From 8d63bee643f1fb53e472f0e135cae4eb99d62d19 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Aug 2017 14:22:55 -0400 Subject: net: avoid skb_warn_bad_offload false positives on UFO skb_warn_bad_offload triggers a warning when an skb enters the GSO stack at __skb_gso_segment that does not have CHECKSUM_PARTIAL checksum offload set. Commit b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") observed that SKB_GSO_DODGY producers can trigger the check and that passing those packets through the GSO handlers will fix it up. But, the software UFO handler will set ip_summed to CHECKSUM_NONE. When __skb_gso_segment is called from the receive path, this triggers the warning again. Make UFO set CHECKSUM_UNNECESSARY instead of CHECKSUM_NONE. On Tx these two are equivalent. On Rx, this better matches the skb state (checksum computed), as CHECKSUM_NONE here means no checksum computed. See also this thread for context: http://patchwork.ozlabs.org/patch/799015/ Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8515f8fe0460..ce15a06d5558 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 781250151d40..0932c85b42af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a2267f80febb..e7d378c032cb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -72,7 +72,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in -- cgit v1.2.3 From 939912216fa8f62331de7d04edff492d5dc8e6e9 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 10 Aug 2017 20:16:29 -0700 Subject: net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx. Because we remove the UFO support, we will also remove the CHECKSUM_UNNECESSARY check in skb_needs_check(). Cc: Willem de Bruijn Signed-off-by: Tonghao Zhang Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3f69f6e71824..1024d3741d12 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2731,8 +2731,7 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_UNNECESSARY; + return skb->ip_summed != CHECKSUM_PARTIAL; return skb->ip_summed == CHECKSUM_NONE; } -- cgit v1.2.3 From 7c4974786f4794178f04e96318fc3b2f2850cbc6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:17 +0800 Subject: net: export some generic xdp helpers This patch tries to export some generic xdp helpers to drivers. This can let driver to do XDP for a specific skb. This is useful for the case when the packet is hard to be processed at page level directly (e.g jumbo/GSO frame). With this patch, there's no need for driver to forbid the XDP set when configuration is not suitable. Instead, it can defer the XDP for packets that is hard to be processed directly after skb is created. Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d238d54c484..0f1c4cb2441e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1024d3741d12..40b28e417072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3919,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3940,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } +EXPORT_SYMBOL_GPL(generic_xdp_tx); static struct static_key generic_xdp_needed __read_mostly; -static int do_xdp_generic(struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); int err; @@ -3971,6 +3970,7 @@ out_redir: kfree_skb(skb); return XDP_DROP; } +EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { @@ -3981,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4502,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); if (ret != XDP_PASS) { rcu_read_unlock(); -- cgit v1.2.3 From 2facaad6000f2322eb40ca379aced31c957f0a41 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 Aug 2017 12:33:08 +0200 Subject: xdp: make generic xdp redirect use tracepoint trace_xdp_redirect If the xdp_do_generic_redirect() call fails, it trigger the trace_xdp_exception tracepoint. It seems better to use the same tracepoint trace_xdp_redirect, as the native xdp_do_redirect{,_map} does. Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- net/core/dev.c | 4 ++-- net/core/filter.c | 35 +++++++++++++++++++++++------------ 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7015116331af..d29e58fde364 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -718,7 +718,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index 40b28e417072..270b54754821 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3953,7 +3953,8 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb); + err = xdp_do_generic_redirect(skb->dev, skb, + xdp_prog); if (err) goto out_redir; /* fallthru to submit skb */ @@ -3966,7 +3967,6 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); kfree_skb(skb); return XDP_DROP; } diff --git a/net/core/filter.c b/net/core/filter.c index a5e5f31b41b9..a04680331033 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2553,26 +2553,37 @@ out: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - unsigned int len; u32 index = ri->ifindex; + struct net_device *fwd; + unsigned int len; + int err = 0; - dev = dev_get_by_index_rcu(dev_net(dev), index); + fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; - if (unlikely(!dev)) { - return -EINVAL; + if (unlikely(!fwd)) { + err = -EINVAL; + goto out; } - if (unlikely(!(dev->flags & IFF_UP))) - return -ENETDOWN; - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len > len) - return -E2BIG; + if (unlikely(!(fwd->flags & IFF_UP))) { + err = -ENETDOWN; + goto out; + } - skb->dev = dev; - return 0; + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) { + err = -EMSGSIZE; + goto out; + } + + skb->dev = fwd; +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 1e22391e8fbec9c3709bad82b997b108d1c6228b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Aug 2017 15:04:32 +0200 Subject: net: missing call of trace_napi_poll in busy_poll_stop Noticed that busy_poll_stop() also invoke the drivers napi->poll() function pointer, but didn't have an associated call to trace_napi_poll() like all other call sites. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ce15a06d5558..818dfa6e7ab5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5289,6 +5289,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); -- cgit v1.2.3 From 25cc72a33835ed8a6f53180a822cadab855852ac Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 10:52:31 +0200 Subject: mlxsw: spectrum: Forbid linking to devices that have uppers The mlxsw driver relies on NETDEV_CHANGEUPPER events to configure the device in case a port is enslaved to a master netdev such as bridge or bond. Since the driver ignores events unrelated to its ports and their uppers, it's possible to engineer situations in which the device's data path differs from the kernel's. One example to such a situation is when a port is enslaved to a bond that is already enslaved to a bridge. When the bond was enslaved the driver ignored the event - as the bond wasn't one of its uppers - and therefore a bridge port instance isn't created in the device. Until such configurations are supported forbid them by checking that the upper device doesn't have uppers of its own. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Reported-by: Nogah Frankel Tested-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 6 ++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 60bf8f27cc00..c6a3e61b53bd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4139,6 +4139,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, return -EINVAL; if (!info->linking) break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; if (netif_is_lag_master(upper_dev) && !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, info->upper_info)) @@ -4258,6 +4260,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, upper_dev = info->upper_dev; if (!netif_is_bridge_master(upper_dev)) return -EINVAL; + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..c99ba7914c0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3866,6 +3866,8 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); +bool netdev_has_any_upper_dev(struct net_device *dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 818dfa6e7ab5..86b4b0a79e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5668,12 +5668,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device -- cgit v1.2.3 From bbbe211c295ffb309247adb7b871dda60d92d2d5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:30 -0700 Subject: net: rcu lock and preempt disable missing around generic xdp do_xdp_generic must be called inside rcu critical section with preempt disabled to ensure BPF programs are valid and per-cpu variables used for redirect operations are consistent. This patch ensures this is true and fixes the splat below. The netif_receive_skb_internal() code path is now broken into two rcu critical sections. I decided it was better to limit the preempt_enable/disable block to just the xdp static key portion and the fallout is more rcu_read_lock/unlock calls. Seems like the best option to me. [ 607.596901] ============================= [ 607.596906] WARNING: suspicious RCU usage [ 607.596912] 4.13.0-rc4+ #570 Not tainted [ 607.596917] ----------------------------- [ 607.596923] net/core/dev.c:3948 suspicious rcu_dereference_check() usage! [ 607.596927] [ 607.596927] other info that might help us debug this: [ 607.596927] [ 607.596933] [ 607.596933] rcu_scheduler_active = 2, debug_locks = 1 [ 607.596938] 2 locks held by pool/14624: [ 607.596943] #0: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x14d/0x890 [ 607.596973] #1: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x14a/0xfd0 [ 607.597000] [ 607.597000] stack backtrace: [ 607.597006] CPU: 5 PID: 14624 Comm: pool Not tainted 4.13.0-rc4+ #570 [ 607.597011] Hardware name: Dell Inc. Precision Tower 5810/0HHV7N, BIOS A17 03/01/2017 [ 607.597016] Call Trace: [ 607.597027] dump_stack+0x67/0x92 [ 607.597040] lockdep_rcu_suspicious+0xdd/0x110 [ 607.597054] do_xdp_generic+0x313/0xa50 [ 607.597068] ? time_hardirqs_on+0x5b/0x150 [ 607.597076] ? mark_held_locks+0x6b/0xc0 [ 607.597088] ? netdev_pick_tx+0x150/0x150 [ 607.597117] netif_rx_internal+0x205/0x3f0 [ 607.597127] ? do_xdp_generic+0xa50/0xa50 [ 607.597144] ? lock_downgrade+0x2b0/0x2b0 [ 607.597158] ? __lock_is_held+0x93/0x100 [ 607.597187] netif_rx+0x119/0x190 [ 607.597202] loopback_xmit+0xfd/0x1b0 [ 607.597214] dev_hard_start_xmit+0x127/0x4e0 Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Fixes: b5cdae3291f7 ("net: Generic XDP") Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 6f845e4fec17..fb766d906148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; + + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - rcu_read_lock(); - if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; - if (ret != XDP_PASS) { - rcu_read_unlock(); + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); + + if (ret != XDP_PASS) return NET_RX_DROP; - } } + rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; -- cgit v1.2.3 From 92dd5452c1be873a1193561f4f691763103d22ac Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 19 Sep 2017 18:45:56 +0100 Subject: net: change skb->mac_header when Generic XDP calls adjust_head Since XDP's view of the packet includes the MAC header, moving the start- of-packet with bpf_xdp_adjust_head needs to also update the offset of the MAC header (which is relative to skb->head, not to the skb->data that was changed). Without this, tcpdump sees packets starting from the old MAC header rather than the new one, at least in my tests on the loopback device. Fixes: b5cdae3291f7 ("net: Generic XDP") Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..9a2254f9802f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3892,6 +3892,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: -- cgit v1.2.3 From 581fe0ea61584d88072527ae9fb9dcb9d1f2783e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Sep 2017 19:42:37 -0400 Subject: net: orphan frags on stand-alone ptype in dev_queue_xmit_nit Zerocopy skbs frags are copied when the skb is looped to a local sock. Commit 1080e512d44d ("net: orphan frags on receive") introduced calls to skb_orphan_frags to deliver_skb and __netif_receive_skb for this. With msg_zerocopy, these skbs can also exist in the tx path and thus loop from dev_queue_xmit_nit. This already calls deliver_skb in its loop. But it does not orphan before a separate pt_prev->func(). Add the missing skb_orphan_frags_rx. Changes v1->v2: handle skb_orphan_frags_rx failure Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9a2254f9802f..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); -- cgit v1.2.3