From 5be5515a8ea198de6eb204a0ff25faf98b8ff719 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Tue, 1 Oct 2019 11:39:04 -0300 Subject: net: core: dev: replace state xoff flag comparison by netif_xmit_stopped method Function netif_schedule_queue() has a hardcoded comparison between queue state and any xoff flag. This comparison does the same thing as method netif_xmit_stopped(). In terms of code clarity, it is better. See other methods like: generic_xdp_tx() and dev_direct_xmit(). Signed-off-by: Julio Faracco Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bf3ed413abaf..21a9c2987cbb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2771,7 +2771,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); -- cgit v1.2.3 From 6958c97a488c69c2421760e4b73834fb63d6a935 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:14 +0200 Subject: net: procfs: use index hashlist instead of name hashlist Name hashlist is going to be used for more than just dev->name, so use rather index hashlist for iteration over net_device instances. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/net-procfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 36347933ec3a..6bbd06f7dc7d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -20,8 +20,8 @@ static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff struct hlist_head *h; unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_name_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, name_hlist) { + h = &net->dev_index_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, h, index_hlist) { if (++count == offset) return dev; } -- cgit v1.2.3 From ff92741270bf8b6e78aa885f166b68c7a67ab13a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:15 +0200 Subject: net: introduce name_node struct to be used in hashlist Introduce name_node structure to hold name of device and put it into hashlist instead of putting there struct net_device directly. Add a necessary infrastructure to manipulate the hashlist. This prepares the code to use the same hashlist for alternative names introduced later in this set. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++- net/core/dev.c | 97 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 87 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9eda1c31d1f7..e92bc5467256 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -925,6 +925,12 @@ struct dev_ifalias { struct devlink; struct tlsdev_ops; +struct netdev_name_node { + struct hlist_node hlist; + struct net_device *dev; + const char *name; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1564,7 +1570,7 @@ enum netdev_priv_flags { * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. * - * @name_hlist: Device name hash chain, please keep it close to name[] + * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start @@ -1774,7 +1780,7 @@ enum netdev_priv_flags { struct net_device { char name[IFNAMSIZ]; - struct hlist_node name_hlist; + struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields diff --git a/net/core/dev.c b/net/core/dev.c index 21a9c2987cbb..d2053d07c94a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -228,6 +228,67 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + return netdev_name_node_alloc(dev, dev->name); +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -237,7 +298,7 @@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -255,7 +316,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -733,14 +794,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -758,14 +815,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -1232,13 +1285,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -8264,6 +8317,8 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -8706,6 +8761,10 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -8827,6 +8886,8 @@ out: return ret; err_uninit: + if (dev->name_node) + netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) -- cgit v1.2.3 From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++ include/uapi/linux/if.h | 1 + include/uapi/linux/if_link.h | 2 + include/uapi/linux/rtnetlink.h | 7 +++ net/core/dev.c | 58 ++++++++++++++++++++++- net/core/rtnetlink.c | 103 +++++++++++++++++++++++++++++++++++++++++ security/selinux/nlmsgtab.c | 4 +- 7 files changed, 177 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e92bc5467256..48cc71aae466 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -927,10 +927,14 @@ struct tlsdev_ops; struct netdev_name_node { struct hlist_node hlist; + struct list_head list; struct net_device *dev; const char *name; }; +int netdev_name_node_alt_create(struct net_device *dev, const char *name); +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..4bf33344aab1 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -33,6 +33,7 @@ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 +#define ALTIFNAMSIZ 128 #include /* For glibc compatibility. An empty enum does not compile. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4a8c02cafa9a..8aec8769d944 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -167,6 +167,8 @@ enum { IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ __IFLA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index ce2a623abb75..1418a8362bb7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -164,6 +164,13 @@ enum { RTM_GETNEXTHOP, #define RTM_GETNEXTHOP RTM_GETNEXTHOP + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/core/dev.c b/net/core/dev.c index d2053d07c94a..7a456c6a7ad8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -245,7 +245,13 @@ static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { - return netdev_name_node_alloc(dev, dev->name); + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) @@ -289,6 +295,55 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, return NULL; } +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. */ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -8317,6 +8372,7 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1ee6460f8275..e13646993d82 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1750,6 +1750,9 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, + [IFLA_PROP_LIST] = { .type = NLA_NESTED }, + [IFLA_ALT_IFNAME] = { .type = NLA_STRING, + .len = ALTIFNAMSIZ - 1 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -3373,6 +3376,103 @@ out: return err; } +static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, + bool *changed, struct netlink_ext_ack *extack) +{ + char *alt_ifname; + int err; + + err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + alt_ifname = nla_data(attr); + if (cmd == RTM_NEWLINKPROP) { + alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + err = netdev_name_node_alt_create(dev, alt_ifname); + if (err) { + kfree(alt_ifname); + return err; + } + } else if (cmd == RTM_DELLINKPROP) { + err = netdev_name_node_alt_destroy(dev, alt_ifname); + if (err) + return err; + } else { + WARN_ON(1); + return 0; + } + + *changed = true; + return 0; +} + +static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ifinfomsg *ifm; + bool changed = false; + struct nlattr *attr; + int err, rem; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); + if (err) + return err; + + err = rtnl_ensure_unique_netns(tb, extack, true); + if (err) + return err; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) { + dev = __dev_get_by_index(net, ifm->ifi_index); + } else if (tb[IFLA_IFNAME]) { + char ifname[IFNAMSIZ]; + + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, ifname); + } else { + return -EINVAL; + } + + if (!dev) + return -ENODEV; + + if (!tb[IFLA_PROP_LIST]) + return 0; + + nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { + switch (nla_type(attr)) { + case IFLA_ALT_IFNAME: + err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); + if (err) + return err; + break; + } + } + + if (changed) + netdev_state_change(dev); + return 0; +} + +static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); +} + +static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); +} + static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -5331,6 +5431,9 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 58345ba0528e..c97fdae8f71b 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -83,6 +83,8 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -166,7 +168,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOP + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWLINKPROP + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 88f4fb0c7496a13b887bdc5052e3aabe3e4dcc5f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:17 +0200 Subject: net: rtnetlink: put alternative names to getlink message Extend exiting getlink info message with list of properties. Now the only ones are alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e13646993d82..c38917371b84 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -980,6 +980,19 @@ static size_t rtnl_xdp_size(void) return xdp_size; } +static size_t rtnl_prop_list_size(const struct net_device *dev) +{ + struct netdev_name_node *name_node; + size_t size; + + if (list_empty(&dev->name_node->list)) + return 0; + size = nla_total_size(0); + list_for_each_entry(name_node, &dev->name_node->list, list) + size += nla_total_size(ALTIFNAMSIZ); + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1027,6 +1040,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + + rtnl_prop_list_size(dev) + 0; } @@ -1584,6 +1598,42 @@ static int rtnl_fill_link_af(struct sk_buff *skb, return 0; } +static int rtnl_fill_alt_ifnames(struct sk_buff *skb, + const struct net_device *dev) +{ + struct netdev_name_node *name_node; + int count = 0; + + list_for_each_entry(name_node, &dev->name_node->list, list) { + if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) + return -EMSGSIZE; + count++; + } + return count; +} + +static int rtnl_fill_prop_list(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *prop_list; + int ret; + + prop_list = nla_nest_start(skb, IFLA_PROP_LIST); + if (!prop_list) + return -EMSGSIZE; + + ret = rtnl_fill_alt_ifnames(skb, dev); + if (ret <= 0) + goto nest_cancel; + + nla_nest_end(skb, prop_list); + return 0; + +nest_cancel: + nla_nest_cancel(skb, prop_list); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1697,6 +1747,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, goto nla_put_failure_rcu; rcu_read_unlock(); + if (rtnl_fill_prop_list(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 7af12cba4ef0caf20bddf84f90509e71006d5408 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:18 +0200 Subject: net: rtnetlink: unify the code in __rtnl_newlink get dev with the rest __rtnl_newlink() code flow is a bit different around tb[IFLA_IFNAME] processing comparing to the other places. Change that to be unified with the rest. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c38917371b84..a0017737442f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3080,12 +3080,10 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else { - if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; - } + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); -- cgit v1.2.3 From cc6090e985d7d62bf2f1230c507d5fbe9899c9ec Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:19 +0200 Subject: net: rtnetlink: introduce helper to get net_device instance by ifname Introduce helper function rtnl_get_dev() that gets net_device structure instance pointer according to passed ifname or ifname attribute. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a0017737442f..77d4719e5be0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2778,6 +2778,23 @@ errout: return err; } +static struct net_device *rtnl_dev_get(struct net *net, + struct nlattr *ifname_attr, + char *ifname) +{ + char buffer[IFNAMSIZ]; + + if (!ifname) { + ifname = buffer; + if (ifname_attr) + nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else + return NULL; + } + + return __dev_get_by_name(net, ifname); +} + static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -2807,7 +2824,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = rtnl_dev_get(net, NULL, ifname); else goto errout; @@ -2880,7 +2897,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; @@ -2894,9 +2910,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); @@ -2909,7 +2922,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -3081,7 +3094,7 @@ replay: if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = rtnl_dev_get(net, NULL, ifname); else dev = NULL; @@ -3363,7 +3376,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; - char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; @@ -3386,9 +3398,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -3397,7 +3406,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(tgt_net, ifname); + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], NULL); else goto out; @@ -3480,16 +3489,12 @@ static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, return err; ifm = nlmsg_data(nlh); - if (ifm->ifi_index > 0) { + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - } else if (tb[IFLA_IFNAME]) { - char ifname[IFNAMSIZ]; - - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, ifname); - } else { + else if (tb[IFLA_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], NULL); + else return -EINVAL; - } if (!dev) return -ENODEV; -- cgit v1.2.3 From 76c9ac0ee878f6693d398d3a95ccaf85e1f597a6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:20 +0200 Subject: net: rtnetlink: add possibility to use alternative names as message handle Extend the basic rtnetlink commands to use alternative interface names as a handle instead of ifindex and ifname. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 77d4719e5be0..49fa910b58af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2780,14 +2780,17 @@ errout: static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *ifname_attr, + struct nlattr *altifname_attr, char *ifname) { - char buffer[IFNAMSIZ]; + char buffer[ALTIFNAMSIZ]; if (!ifname) { ifname = buffer; if (ifname_attr) nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + else if (altifname_attr) + nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2823,8 +2826,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = rtnl_dev_get(net, NULL, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else goto errout; @@ -2921,8 +2924,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = rtnl_dev_get(net, tb[IFLA_IFNAME], NULL); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else @@ -3093,8 +3097,8 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = rtnl_dev_get(net, NULL, ifname); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, NULL, tb[IFLA_ALT_IFNAME], ifname); else dev = NULL; @@ -3358,6 +3362,7 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, switch (i) { case IFLA_IFNAME: + case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; @@ -3405,8 +3410,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], NULL); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(tgt_net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else goto out; @@ -3491,8 +3497,9 @@ static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (tb[IFLA_IFNAME]) - dev = rtnl_dev_get(net, tb[IFLA_IFNAME], NULL); + else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + dev = rtnl_dev_get(net, tb[IFLA_IFNAME], + tb[IFLA_ALT_IFNAME], NULL); else return -EINVAL; -- cgit v1.2.3 From afa0df5998131153ec3036f41e76ece33bf1334f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:09 +0200 Subject: net: push loops and nb calls into helper functions Push iterations over net namespaces and netdevices from register_netdevice_notifier() and unregister_netdevice_notifier() into helper functions. Along with that introduce continue_reverse macros to make the code a bit nicer allowing to get rid of "last" marks. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++ include/net/net_namespace.h | 3 +- net/core/dev.c | 89 +++++++++++++++++++++++++++++++-------------- 3 files changed, 66 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 48cc71aae466..7b183f724fc4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2574,6 +2574,9 @@ extern rwlock_t dev_base_lock; /* Device list lock */ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) +#define for_each_netdev_continue_reverse(net, d) \ + list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ + dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f8712bbeb2e0..c5a98e03591d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -317,7 +317,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) /* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) - +#define for_each_net_continue_reverse(VAR) \ + list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list) #define for_each_net_rcu(VAR) \ list_for_each_entry_rcu(VAR, &net_namespace_list, list) diff --git a/net/core/dev.c b/net/core/dev.c index 7a456c6a7ad8..a8b70cb6c732 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1725,6 +1725,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1743,8 +1799,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1757,17 +1811,9 @@ int register_netdevice_notifier(struct notifier_block *nb) if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1776,22 +1822,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } -- cgit v1.2.3 From a30c7b429f2dd980202c912fcb76442364937b4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:10 +0200 Subject: net: introduce per-netns netdevice notifiers Often the code for example in drivers is interested in getting notifier call only from certain network namespace. In addition to the existing global netdevice notifier chain introduce per-netns chains and allow users to register to that. Eventually this would eliminate unnecessary overhead in case there are many netdevices in many network namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++ include/net/net_namespace.h | 3 ++ net/core/dev.c | 87 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7b183f724fc4..fe45b2c72315 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2504,6 +2504,9 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb); struct netdev_notifier_info { struct net_device *dev; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index c5a98e03591d..5ac2bb16d4b3 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -36,6 +36,7 @@ #include #include #include +#include struct user_namespace; struct proc_dir_entry; @@ -96,6 +97,8 @@ struct net { struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; + struct raw_notifier_head netdev_chain; + unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; unsigned int dev_unreg_count; diff --git a/net/core/dev.c b/net/core/dev.c index a8b70cb6c732..c680225e0da8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1874,6 +1874,80 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto chain_unregister; + +unlock: + rtnl_unlock(); + return err; + +chain_unregister: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + goto unlock; + + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1886,7 +1960,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -9785,6 +9870,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: -- cgit v1.2.3 From 9077f052abd5391a866dd99e27212213648becef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2019 08:59:24 -0700 Subject: net: propagate errors correctly in register_netdevice() If netdev_name_node_head_alloc() fails to allocate memory, we absolutely want register_netdevice() to return -ENOMEM instead of zero :/ One of the syzbot report looked like : general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8760 Comm: syz-executor839 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ovs_vport_add+0x185/0x500 net/openvswitch/vport.c:205 Code: 89 c6 e8 3e b6 3a fa 49 81 fc 00 f0 ff ff 0f 87 6d 02 00 00 e8 8c b4 3a fa 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 d3 02 00 00 49 8d 7c 24 08 49 8b 34 24 48 b8 00 RSP: 0018:ffff88808fe5f4e0 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: ffffffff89be8820 RCX: ffffffff87385162 RDX: 0000000000000000 RSI: ffffffff87385174 RDI: 0000000000000007 RBP: ffff88808fe5f510 R08: ffff8880933c6600 R09: fffffbfff14ee13c R10: fffffbfff14ee13b R11: ffffffff8a7709df R12: 0000000000000004 R13: ffffffff89be8850 R14: ffff88808fe5f5e0 R15: 0000000000000002 FS: 0000000001d71880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000280 CR3: 0000000096e4c000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: new_vport+0x1b/0x1d0 net/openvswitch/datapath.c:194 ovs_dp_cmd_new+0x5e5/0xe30 net/openvswitch/datapath.c:1644 genl_family_rcv_msg+0x74b/0xf90 net/netlink/genetlink.c:629 genl_rcv_msg+0xca/0x170 net/netlink/genetlink.c:654 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 genl_rcv+0x29/0x40 net/netlink/genetlink.c:665 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x803/0x920 net/socket.c:2311 __sys_sendmsg+0x105/0x1d0 net/socket.c:2356 __do_sys_sendmsg net/socket.c:2365 [inline] __se_sys_sendmsg net/socket.c:2363 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363 Fixes: ff92741270bf ("net: introduce name_node struct to be used in hashlist") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reported-by: syzbot Tested-by: Willem de Bruijn Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c680225e0da8..944de67ee95d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8935,6 +8935,7 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; -- cgit v1.2.3 From 7c550daffe22a97282effa75fe7c1f6b83563ecb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:27 +0200 Subject: net: fib_notifier: make FIB notifier per-netns Currently all users of FIB notifier only cares about events in init_net. Later in this patchset, users get interested in other namespaces too. However, for every registered block user is interested only about one namespace. Make the FIB notifier registration per-netns and avoid unnecessary calls of notifier block for other namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 7 +- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 9 +-- drivers/net/ethernet/rocker/rocker_main.c | 9 +-- drivers/net/netdevsim/fib.c | 8 +- include/linux/mroute_base.h | 10 +-- include/net/fib_notifier.h | 7 +- include/net/ip6_fib.h | 2 +- include/net/ip_fib.h | 2 +- net/core/fib_notifier.c | 87 ++++++++++------------ net/core/fib_rules.c | 7 +- net/ipv4/fib_notifier.c | 4 +- net/ipv4/fib_trie.c | 17 ++--- net/ipv4/ipmr_base.c | 4 +- net/ipv6/fib6_notifier.c | 4 +- net/ipv6/ip6_fib.c | 6 +- 15 files changed, 78 insertions(+), 105 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c index 5d20d615663e..fe0cc969cf94 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c @@ -248,9 +248,6 @@ static int mlx5_lag_fib_event(struct notifier_block *nb, struct net_device *fib_dev; struct fib_info *fi; - if (!net_eq(info->net, &init_net)) - return NOTIFY_DONE; - if (info->family != AF_INET) return NOTIFY_DONE; @@ -311,7 +308,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev) return 0; mp->fib_nb.notifier_call = mlx5_lag_fib_event; - err = register_fib_notifier(&mp->fib_nb, + err = register_fib_notifier(&init_net, &mp->fib_nb, mlx5_lag_fib_event_flush); if (err) mp->fib_nb.notifier_call = NULL; @@ -326,6 +323,6 @@ void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) if (!mp->fib_nb.notifier_call) return; - unregister_fib_notifier(&mp->fib_nb); + unregister_fib_notifier(&init_net, &mp->fib_nb); mp->fib_nb.notifier_call = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index a330b369e899..d0db9ea71323 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -6213,7 +6213,7 @@ static int mlxsw_sp_router_fib_rule_event(unsigned long event, rule = fr_info->rule; /* Rule only affects locally generated traffic */ - if (rule->iifindex == info->net->loopback_dev->ifindex) + if (rule->iifindex == init_net.loopback_dev->ifindex) return 0; switch (info->family) { @@ -6250,8 +6250,7 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, struct mlxsw_sp_router *router; int err; - if (!net_eq(info->net, &init_net) || - (info->family != AF_INET && info->family != AF_INET6 && + if ((info->family != AF_INET && info->family != AF_INET6 && info->family != RTNL_FAMILY_IPMR && info->family != RTNL_FAMILY_IP6MR)) return NOTIFY_DONE; @@ -8155,7 +8154,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) goto err_dscp_init; mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event; - err = register_fib_notifier(&mlxsw_sp->router->fib_nb, + err = register_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb, mlxsw_sp_router_fib_dump_flush); if (err) goto err_register_fib_notifier; @@ -8195,7 +8194,7 @@ err_register_inetaddr_notifier: void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) { - unregister_fib_notifier(&mlxsw_sp->router->fib_nb); + unregister_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb); unregister_netevent_notifier(&mlxsw_sp->router->netevent_nb); mlxsw_sp_neigh_fini(mlxsw_sp); mlxsw_sp_vrs_fini(mlxsw_sp); diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 786b158bd305..e54f6341a785 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2189,9 +2189,6 @@ static int rocker_router_fib_event(struct notifier_block *nb, struct rocker_fib_event_work *fib_work; struct fib_notifier_info *info = ptr; - if (!net_eq(info->net, &init_net)) - return NOTIFY_DONE; - if (info->family != AF_INET) return NOTIFY_DONE; @@ -2994,7 +2991,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) * the device, so no need to pass a callback. */ rocker->fib_nb.notifier_call = rocker_router_fib_event; - err = register_fib_notifier(&rocker->fib_nb, NULL); + err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL); if (err) goto err_register_fib_notifier; @@ -3021,7 +3018,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_register_switchdev_blocking_notifier: unregister_switchdev_notifier(&rocker_switchdev_notifier); err_register_switchdev_notifier: - unregister_fib_notifier(&rocker->fib_nb); + unregister_fib_notifier(&init_net, &rocker->fib_nb); err_register_fib_notifier: rocker_remove_ports(rocker); err_probe_ports: @@ -3057,7 +3054,7 @@ static void rocker_remove(struct pci_dev *pdev) unregister_switchdev_blocking_notifier(nb); unregister_switchdev_notifier(&rocker_switchdev_notifier); - unregister_fib_notifier(&rocker->fib_nb); + unregister_fib_notifier(&init_net, &rocker->fib_nb); rocker_remove_ports(rocker); rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET); destroy_workqueue(rocker->rocker_owq); diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 7de17e42d77a..01ee9cc54605 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -182,9 +182,6 @@ static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, struct fib_notifier_info *info = ptr; int err = 0; - if (!net_eq(info->net, &init_net)) - return NOTIFY_DONE; - switch (event) { case FIB_EVENT_RULE_ADD: /* fall through */ case FIB_EVENT_RULE_DEL: @@ -258,7 +255,8 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink) data->ipv6.rules.max = (u64)-1; data->fib_nb.notifier_call = nsim_fib_event_nb; - err = register_fib_notifier(&data->fib_nb, nsim_fib_dump_inconsistent); + err = register_fib_notifier(&init_net, &data->fib_nb, + nsim_fib_dump_inconsistent); if (err) { pr_err("Failed to register fib notifier\n"); goto err_out; @@ -297,6 +295,6 @@ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data) NSIM_RESOURCE_IPV4_FIB_RULES); devlink_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB); - unregister_fib_notifier(&data->fib_nb); + unregister_fib_notifier(&init_net, &data->fib_nb); kfree(data); } diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 34de06b426ef..0931631bbc13 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -47,7 +47,6 @@ struct vif_entry_notifier_info { }; static inline int mr_call_vif_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, @@ -56,7 +55,6 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -64,7 +62,7 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_vif_notifiers(struct net *net, @@ -77,7 +75,6 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .dev = vif->dev, .vif_index = vif_index, @@ -173,7 +170,6 @@ struct mfc_entry_notifier_info { }; static inline int mr_call_mfc_notifier(struct notifier_block *nb, - struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id) @@ -181,13 +177,12 @@ static inline int mr_call_mfc_notifier(struct notifier_block *nb, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static inline int mr_call_mfc_notifiers(struct net *net, @@ -199,7 +194,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, struct mfc_entry_notifier_info info = { .info = { .family = family, - .net = net, }, .mfc = mfc, .tb_id = tb_id diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index c49d7bfb5c30..23353f67b2b0 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -8,7 +8,6 @@ struct module; struct fib_notifier_info { - struct net *net; int family; struct netlink_ext_ack *extack; }; @@ -35,14 +34,14 @@ struct fib_notifier_ops { struct rcu_head rcu; }; -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); -int register_fib_notifier(struct notifier_block *nb, +int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb)); -int unregister_fib_notifier(struct notifier_block *nb); +int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); void fib_notifier_ops_unregister(struct fib_notifier_ops *ops); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4b5656c71abc..14e9fca0e326 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -478,7 +478,7 @@ struct ipv6_route_iter { extern const struct seq_operations ipv6_route_seq_ops; -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ab1ca9e238d2..a9df85304f40 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -219,7 +219,7 @@ struct fib_nh_notifier_info { struct fib_nh *fib_nh; }; -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 470a606d5e8d..fbd029425638 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -12,17 +12,15 @@ static unsigned int fib_notifier_net_id; struct fib_notifier_net { struct list_head fib_notifier_ops; + struct atomic_notifier_head fib_chain; }; -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, +int call_fib_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { int err; - info->net = net; err = nb->notifier_call(nb, event_type, info); return notifier_to_errno(err); } @@ -31,35 +29,29 @@ EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); int err; - info->net = net; - err = atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info); return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -static unsigned int fib_seq_sum(void) +static unsigned int fib_seq_sum(struct net *net) { - struct fib_notifier_net *fn_net; + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - struct net *net; rtnl_lock(); - down_read(&net_rwsem); - for_each_net(net) { - fn_net = net_generic(net, fib_notifier_net_id); - rcu_read_lock(); - list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - if (!try_module_get(ops->owner)) - continue; - fib_seq += ops->fib_seq_read(net); - module_put(ops->owner); - } - rcu_read_unlock(); + rcu_read_lock(); + list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; + fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); } - up_read(&net_rwsem); + rcu_read_unlock(); rtnl_unlock(); return fib_seq; @@ -69,68 +61,66 @@ static int fib_net_dump(struct net *net, struct notifier_block *nb) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; + int err = 0; + rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { - int err; - if (!try_module_get(ops->owner)) continue; err = ops->fib_dump(net, nb); module_put(ops->owner); if (err) - return err; + goto unlock; } - return 0; +unlock: + rcu_read_unlock(); + + return err; } -static bool fib_dump_is_consistent(struct notifier_block *nb, +static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb), unsigned int fib_seq) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + atomic_notifier_chain_register(&fn_net->fib_chain, nb); + if (fib_seq == fib_seq_sum(net)) return true; - atomic_notifier_chain_unregister(&fib_chain, nb); + atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); if (cb) cb(nb); return false; } #define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, +int register_fib_notifier(struct net *net, struct notifier_block *nb, void (*cb)(struct notifier_block *nb)) { int retries = 0; int err; do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - rcu_read_lock(); - for_each_net_rcu(net) { - err = fib_net_dump(net, nb); - if (err) - goto err_fib_net_dump; - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) + unsigned int fib_seq = fib_seq_sum(net); + + err = fib_net_dump(net, nb); + if (err) + return err; + + if (fib_dump_is_consistent(net, nb, cb, fib_seq)) return 0; } while (++retries < FIB_DUMP_MAX_RETRIES); return -EBUSY; - -err_fib_net_dump: - rcu_read_unlock(); - return err; } EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +int unregister_fib_notifier(struct net *net, struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); + + return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); @@ -181,6 +171,7 @@ static int __net_init fib_notifier_net_init(struct net *net) struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); INIT_LIST_HEAD(&fn_net->fib_notifier_ops); + ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain); return 0; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index dd220ce7ca7a..28cbf07102bc 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -321,7 +321,7 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); -static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family) { @@ -330,7 +330,7 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -359,8 +359,7 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, - family); + call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family); rules_ops_put(ops); return 0; diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index b804ccbdb241..1a128c1346fb 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -9,12 +9,12 @@ #include #include -int call_fib4_notifier(struct notifier_block *nb, struct net *net, +int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1ab2fb6bb37d..5b600b2a2aa3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -74,7 +74,7 @@ #include #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa) { @@ -86,7 +86,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = fa->fa_type, .tb_id = fa->tb_id, }; - return call_fib4_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -2015,8 +2015,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb) { struct fib_alias *fa; @@ -2032,20 +2032,19 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, + call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, KEYLENGTH - fa->fa_slen, fa); } } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static void fib_table_notify(struct fib_table *tb, struct notifier_block *nb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + fib_leaf_notify(l, tb, nb); key = l->key + 1; /* stop in case of wrap around */ @@ -2063,7 +2062,7 @@ void fib_notify(struct net *net, struct notifier_block *nb) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + fib_table_notify(tb, nb); } } diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index ea48bd15a575..4dcc3214e3cc 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -409,7 +409,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, net, family, + mr_call_vif_notifier(nb, family, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); } @@ -417,7 +417,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, /* Notify on table MFC entries */ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, net, family, + mr_call_mfc_notifier(nb, family, FIB_EVENT_ENTRY_ADD, mfc, mrt->id); } diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 05f82baaa99e..4fe79296999a 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -7,12 +7,12 @@ #include #include -int call_fib6_notifier(struct notifier_block *nb, struct net *net, +int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; - return call_fib_notifier(nb, net, event_type, info); + return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6e2af411cd9c..f6fae48b2e18 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -357,7 +357,7 @@ unsigned int fib6_tables_seq_read(struct net *net) return fib_seq; } -static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt) { @@ -365,7 +365,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, .rt = rt, }; - return call_fib6_notifier(nb, net, event_type, &info.info); + return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, @@ -407,7 +407,7 @@ static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) return; - call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); + call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt); } static int fib6_node_dump(struct fib6_walker *w) -- cgit v1.2.3 From 55c894f762a1a99fca80ee55d593083d78e7e4fb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:28 +0200 Subject: net: fib_notifier: propagate possible error during fib notifier registration Unlike events for registered notifier, during the registration, the errors that happened for the block being registered are not propagated up to the caller. Make sure the error is propagated for FIB rules and entries. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/core/fib_rules.c | 11 ++++++++--- net/ipv4/fib_notifier.c | 4 +--- net/ipv4/fib_trie.c | 31 ++++++++++++++++++++++--------- net/ipv4/ipmr_base.c | 22 +++++++++++++++------- net/ipv6/ip6_fib.c | 36 ++++++++++++++++++++++++------------ 6 files changed, 71 insertions(+), 35 deletions(-) (limited to 'net/core') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a9df85304f40..05c1fd9c5e23 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,7 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -void fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb); struct fib_table { struct hlist_node tb_hlist; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 28cbf07102bc..592d8aef90e3 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -354,15 +354,20 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) { struct fib_rules_ops *ops; struct fib_rule *rule; + int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, + rule, family); + if (err) + break; + } rules_ops_put(ops); - return 0; + return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 1a128c1346fb..0c57f68a9340 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -42,9 +42,7 @@ static int fib4_dump(struct net *net, struct notifier_block *nb) if (err) return err; - fib_notify(net, nb); - - return 0; + return fib_notify(net, nb); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5b600b2a2aa3..568e59423773 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2015,10 +2015,11 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } } -static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb, - struct notifier_block *nb) +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb) { struct fib_alias *fa; + int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -2032,38 +2033,50 @@ static void fib_leaf_notify(struct key_vector *l, struct fib_table *tb, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, + KEYLENGTH - fa->fa_slen, fa); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct fib_table *tb, struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(l, tb, nb); + err = fib_leaf_notify(l, tb, nb); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 4dcc3214e3cc..c4e23c2a0d5c 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -409,17 +409,25 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, if (!v->dev) continue; - mr_call_vif_notifier(nb, family, - FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + err = mr_call_vif_notifier(nb, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + if (err) + break; } read_unlock(mrt_lock); + if (err) + return err; + /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - mr_call_mfc_notifier(nb, family, - FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + err = mr_call_mfc_notifier(nb, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id); + if (err) + return err; + } } return 0; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f6fae48b2e18..76124a909395 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -403,30 +403,37 @@ struct fib6_dump_arg { struct notifier_block *nb; }; -static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) - return; - call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt); + return 0; + return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt); } static int fib6_node_dump(struct fib6_walker *w) { struct fib6_info *rt; + int err = 0; - for_each_fib6_walker_rt(w) - fib6_rt_dump(rt, w->args); + for_each_fib6_walker_rt(w) { + err = fib6_rt_dump(rt, w->args); + if (err) + break; + } w->leaf = NULL; - return 0; + return err; } -static void fib6_table_dump(struct net *net, struct fib6_table *tb, - struct fib6_walker *w) +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) { + int err; + w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); - fib6_walk(net, w); + err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); + return err; } /* Called with rcu_read_lock() */ @@ -435,6 +442,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; + int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) @@ -449,13 +457,17 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib6_table_dump(net, tb, w); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err < 0) + goto out; + } } +out: kfree(w); - return 0; + return err; } static int fib6_dump_node(struct fib6_walker *w) -- cgit v1.2.3 From b7a595577ef3dc9add2b3e6d00869d017306bfbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:30 +0200 Subject: net: fib_notifier: propagate extack down to the notifier block callback Since errors are propagated all the way up to the caller, propagate possible extack of the caller all the way down to the notifier block callback. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/netdevsim/fib.c | 2 +- include/linux/mroute_base.h | 18 ++++++++++++------ include/net/fib_notifier.h | 6 ++++-- include/net/fib_rules.h | 3 ++- include/net/ip6_fib.h | 9 ++++++--- include/net/ip_fib.h | 9 ++++++--- net/core/fib_notifier.c | 10 ++++++---- net/core/fib_rules.c | 9 ++++++--- net/ipv4/fib_notifier.c | 7 ++++--- net/ipv4/fib_rules.c | 5 +++-- net/ipv4/fib_trie.c | 20 +++++++++++++------- net/ipv4/ipmr.c | 13 ++++++++----- net/ipv4/ipmr_base.c | 12 +++++++----- net/ipv6/fib6_notifier.c | 7 ++++--- net/ipv6/fib6_rules.c | 5 +++-- net/ipv6/ip6_fib.c | 12 +++++++++--- net/ipv6/ip6mr.c | 13 ++++++++----- 20 files changed, 105 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c index fe0cc969cf94..13e2944b1274 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c @@ -309,7 +309,7 @@ int mlx5_lag_mp_init(struct mlx5_lag *ldev) mp->fib_nb.notifier_call = mlx5_lag_fib_event; err = register_fib_notifier(&init_net, &mp->fib_nb, - mlx5_lag_fib_event_flush); + mlx5_lag_fib_event_flush, NULL); if (err) mp->fib_nb.notifier_call = NULL; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 1eeff1d23b13..445e2daa54ac 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -8135,7 +8135,7 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) mlxsw_sp->router->fib_nb.notifier_call = mlxsw_sp_router_fib_event; err = register_fib_notifier(&init_net, &mlxsw_sp->router->fib_nb, - mlxsw_sp_router_fib_dump_flush); + mlxsw_sp_router_fib_dump_flush, NULL); if (err) goto err_register_fib_notifier; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index e54f6341a785..bc4f951315da 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2991,7 +2991,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) * the device, so no need to pass a callback. */ rocker->fib_nb.notifier_call = rocker_router_fib_event; - err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL); + err = register_fib_notifier(&init_net, &rocker->fib_nb, NULL, NULL); if (err) goto err_register_fib_notifier; diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 01ee9cc54605..d2aeac0f4c2c 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -256,7 +256,7 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink) data->fib_nb.notifier_call = nsim_fib_event_nb; err = register_fib_notifier(&init_net, &data->fib_nb, - nsim_fib_dump_inconsistent); + nsim_fib_dump_inconsistent, NULL); if (err) { pr_err("Failed to register fib notifier\n"); goto err_out; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0931631bbc13..8071148f29a6 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -50,11 +50,13 @@ static inline int mr_call_vif_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, struct vif_device *vif, - unsigned short vif_index, u32 tb_id) + unsigned short vif_index, u32 tb_id, + struct netlink_ext_ack *extack) { struct vif_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .dev = vif->dev, .vif_index = vif_index, @@ -172,11 +174,13 @@ struct mfc_entry_notifier_info { static inline int mr_call_mfc_notifier(struct notifier_block *nb, unsigned short family, enum fib_event_type event_type, - struct mr_mfc *mfc, u32 tb_id) + struct mr_mfc *mfc, u32 tb_id, + struct netlink_ext_ack *extack) { struct mfc_entry_notifier_info info = { .info = { .family = family, + .extack = extack, }, .mfc = mfc, .tb_id = tb_id @@ -295,10 +299,11 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock); + rwlock_t *mrt_lock, struct netlink_ext_ack *extack); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -349,10 +354,11 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, static inline int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, struct netlink_ext_ack *extack) { return -EINVAL; } diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 23353f67b2b0..6d59221ff05a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -29,7 +29,8 @@ struct fib_notifier_ops { int family; struct list_head list; unsigned int (*fib_seq_read)(struct net *net); - int (*fib_dump)(struct net *net, struct notifier_block *nb); + int (*fib_dump)(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct module *owner; struct rcu_head rcu; }; @@ -40,7 +41,8 @@ int call_fib_notifier(struct notifier_block *nb, int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int register_fib_notifier(struct net *net, struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)); + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack); int unregister_fib_notifier(struct net *net, struct notifier_block *nb); struct fib_notifier_ops * fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 20dcadd8eed9..54e227e6b06a 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -194,7 +194,8 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family); +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 14e9fca0e326..5d1615463138 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -488,7 +488,8 @@ int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); -int fib6_tables_dump(struct net *net, struct notifier_block *nb); +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); @@ -504,7 +505,8 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); -int fib6_rules_dump(struct net *net, struct notifier_block *nb); +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, @@ -537,7 +539,8 @@ static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } -static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 05c1fd9c5e23..52b2406a5dfc 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -229,7 +229,8 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); -int fib_notify(struct net *net, struct notifier_block *nb); +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; @@ -315,7 +316,8 @@ static inline bool fib4_rule_default(const struct fib_rule *rule) return true; } -static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb) +static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -377,7 +379,8 @@ out: } bool fib4_rule_default(const struct fib_rule *rule); -int fib4_rules_dump(struct net *net, struct notifier_block *nb); +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fbd029425638..fc96259807b6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -57,7 +57,8 @@ static unsigned int fib_seq_sum(struct net *net) return fib_seq; } -static int fib_net_dump(struct net *net, struct notifier_block *nb) +static int fib_net_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id); struct fib_notifier_ops *ops; @@ -67,7 +68,7 @@ static int fib_net_dump(struct net *net, struct notifier_block *nb) list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) continue; - err = ops->fib_dump(net, nb); + err = ops->fib_dump(net, nb, extack); module_put(ops->owner); if (err) goto unlock; @@ -96,7 +97,8 @@ static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb, #define FIB_DUMP_MAX_RETRIES 5 int register_fib_notifier(struct net *net, struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) + void (*cb)(struct notifier_block *nb), + struct netlink_ext_ack *extack) { int retries = 0; int err; @@ -104,7 +106,7 @@ int register_fib_notifier(struct net *net, struct notifier_block *nb, do { unsigned int fib_seq = fib_seq_sum(net); - err = fib_net_dump(net, nb); + err = fib_net_dump(net, nb, extack); if (err) return err; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 592d8aef90e3..3e7e15278c46 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -323,10 +323,12 @@ EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib_rule *rule, int family) + struct fib_rule *rule, int family, + struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, + .info.extack = extack, .rule = rule, }; @@ -350,7 +352,8 @@ static int call_fib_rule_notifiers(struct net *net, } /* Called with rcu_read_lock() */ -int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, + struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; @@ -361,7 +364,7 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, - rule, family); + rule, family, extack); if (err) break; } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 0c57f68a9340..0c28bd469a68 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -34,15 +34,16 @@ static unsigned int fib4_seq_read(struct net *net) return net->ipv4.fib_seq + fib4_rules_seq_read(net); } -static int fib4_dump(struct net *net, struct notifier_block *nb) +static int fib4_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib4_rules_dump(net, nb); + err = fib4_rules_dump(net, nb, extack); if (err) return err; - return fib_notify(net, nb); + return fib_notify(net, nb, extack); } static const struct fib_notifier_ops fib4_notifier_ops_template = { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b43a7ba5c6a4..f99e3bac5cab 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -65,9 +65,10 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); -int fib4_rules_dump(struct net *net, struct notifier_block *nb) +int fib4_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET); + return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(struct net *net) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 568e59423773..b9df9c09b84e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -76,9 +76,11 @@ static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_alias *fa) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, @@ -2016,7 +2018,8 @@ void fib_info_notify_update(struct net *net, struct nl_info *info) } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, - struct notifier_block *nb) + struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib_alias *fa; int err; @@ -2034,14 +2037,16 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, continue; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fa); + KEYLENGTH - fa->fa_slen, + fa, extack); if (err) return err; } return 0; } -static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; @@ -2049,7 +2054,7 @@ static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb) int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - err = fib_leaf_notify(l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; @@ -2061,7 +2066,8 @@ static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb) return 0; } -int fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; int err; @@ -2071,7 +2077,7 @@ int fib_notify(struct net *net, struct notifier_block *nb) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - err = fib_table_notify(tb, nb); + err = fib_table_notify(tb, nb, extack); if (err) return err; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 313470f6bb14..051f365b64d2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -278,9 +278,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(struct net *net) @@ -336,7 +337,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) rtnl_unlock(); } -static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -3040,10 +3042,11 @@ static unsigned int ipmr_seq_read(struct net *net) return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); } -static int ipmr_dump(struct net *net, struct notifier_block *nb) +static int ipmr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, - ipmr_mr_table_iter, &mrt_lock); + ipmr_mr_table_iter, &mrt_lock, extack); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index c4e23c2a0d5c..aa8738a91210 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -386,15 +386,17 @@ EXPORT_SYMBOL(mr_rtm_dumproute); int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, int (*rules_dump)(struct net *net, - struct notifier_block *nb), + struct notifier_block *nb, + struct netlink_ext_ack *extack), struct mr_table *(*mr_iter)(struct net *net, struct mr_table *mrt), - rwlock_t *mrt_lock) + rwlock_t *mrt_lock, + struct netlink_ext_ack *extack) { struct mr_table *mrt; int err; - err = rules_dump(net, nb); + err = rules_dump(net, nb, extack); if (err) return err; @@ -411,7 +413,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, err = mr_call_vif_notifier(nb, family, FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); + v, vifi, mrt->id, extack); if (err) break; } @@ -424,7 +426,7 @@ int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { err = mr_call_mfc_notifier(nb, family, FIB_EVENT_ENTRY_ADD, - mfc, mrt->id); + mfc, mrt->id, extack); if (err) return err; } diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 4fe79296999a..f87ae33e1d01 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -27,15 +27,16 @@ static unsigned int fib6_seq_read(struct net *net) return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } -static int fib6_dump(struct net *net, struct notifier_block *nb) +static int fib6_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { int err; - err = fib6_rules_dump(net, nb); + err = fib6_rules_dump(net, nb, extack); if (err) return err; - return fib6_tables_dump(net, nb); + return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index f9e8fe3ff0c5..fafe556d21e0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -47,9 +47,10 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); -int fib6_rules_dump(struct net *net, struct notifier_block *nb) +int fib6_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, AF_INET6); + return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(struct net *net) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 76124a909395..f66bc2af4e9d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -359,9 +359,11 @@ unsigned int fib6_tables_seq_read(struct net *net) static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, - struct fib6_info *rt) + struct fib6_info *rt, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { + .info.extack = extack, .rt = rt, }; @@ -401,13 +403,15 @@ int call_fib6_multipath_entry_notifiers(struct net *net, struct fib6_dump_arg { struct net *net; struct notifier_block *nb; + struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { if (rt == arg->net->ipv6.fib6_null_entry) return 0; - return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt); + return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, + rt, arg->extack); } static int fib6_node_dump(struct fib6_walker *w) @@ -437,7 +441,8 @@ static int fib6_table_dump(struct net *net, struct fib6_table *tb, } /* Called with rcu_read_lock() */ -int fib6_tables_dump(struct net *net, struct notifier_block *nb) +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; @@ -451,6 +456,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) w->func = fib6_node_dump; arg.net = net; arg.nb = nb; + arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 857a89ad4d6c..bfa49ff70531 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -265,9 +265,10 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { - return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) @@ -324,7 +325,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) rtnl_unlock(); } -static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return 0; } @@ -1256,10 +1258,11 @@ static unsigned int ip6mr_seq_read(struct net *net) return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } -static int ip6mr_dump(struct net *net, struct notifier_block *nb) +static int ip6mr_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, - ip6mr_mr_table_iter, &mrt_lock); + ip6mr_mr_table_iter, &mrt_lock, extack); } static struct notifier_block ip6_mr_notifier = { -- cgit v1.2.3 From 471f894f106573b0b086d1003ee6172253c67b59 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:31 +0200 Subject: net: devlink: export devlink net getter Allow drivers to get net struct for devlink instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + net/core/devlink.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..5ac2be0f0857 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -771,6 +771,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; +struct net *devlink_net(const struct devlink *devlink); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); diff --git a/net/core/devlink.c b/net/core/devlink.c index e48680efe54a..362cbbcca225 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -95,10 +95,11 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -static struct net *devlink_net(const struct devlink *devlink) +struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } +EXPORT_SYMBOL_GPL(devlink_net); static void devlink_net_set(struct devlink *devlink, struct net *net) { -- cgit v1.2.3 From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:39 +0200 Subject: net: devlink: allow to change namespaces during reload All devlink instances are created in init_net and stay there for a lifetime. Allow user to be able to move devlink instances into namespaces during devlink reload operation. That ensures proper re-instantiation of driver objects, including netdevices. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 6 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 1 + drivers/net/netdevsim/dev.c | 2 +- include/net/devlink.h | 2 +- include/uapi/linux/devlink.h | 4 + net/core/devlink.c | 154 +++++++++++++++++++++++++++-- 6 files changed, 158 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index fce9b3a24347..22c72fb7206a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3935,13 +3935,17 @@ static void mlx4_restart_one_down(struct pci_dev *pdev); static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload, struct devlink *devlink); -static int mlx4_devlink_reload_down(struct devlink *devlink, +static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); struct mlx4_dev *dev = &priv->dev; struct mlx4_dev_persistent *persist = dev->persist; + if (netns_change) { + NL_SET_ERR_MSG_MOD(extack, "Namespace change is not supported"); + return -EOPNOTSUPP; + } if (persist->num_vfs) mlx4_warn(persist->dev, "Reload performed on PF, will cause reset on operating Virtual Functions\n"); mlx4_restart_one_down(persist->pdev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 1e61a012ca43..1c29522a2af3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -985,6 +985,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, + bool netns_change, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 7de80faab047..3f3c7cc21077 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -473,7 +473,7 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack); static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); -static int nsim_dev_reload_down(struct devlink *devlink, +static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); diff --git a/include/net/devlink.h b/include/net/devlink.h index 5ac2be0f0857..3c9d4a063c98 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -643,7 +643,7 @@ enum devlink_trap_group_generic_id { } struct devlink_ops { - int (*reload_down)(struct devlink *devlink, + int (*reload_down)(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, struct netlink_ext_ack *extack); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 580b7a2e40e1..b558ea88b766 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,6 +421,10 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ + DEVLINK_ATTR_NETNS_PID, /* u32 */ + DEVLINK_ATTR_NETNS_ID, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 362cbbcca225..c4d8c4ab0fb5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -435,8 +435,16 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, { struct devlink *devlink; - devlink = devlink_get_from_info(info); - if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + /* When devlink changes netns, it would not be found + * by devlink_get_from_info(). So try if it is stored first. + */ + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + devlink = info->user_ptr[0]; + } else { + devlink = devlink_get_from_info(info); + WARN_ON(IS_ERR(devlink)); + } + if (!IS_ERR(devlink) && ~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -2675,6 +2683,72 @@ devlink_resources_validate(struct devlink *devlink, return err; } +static struct net *devlink_netns_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID]; + struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD]; + struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID]; + struct net *net; + + if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { + NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + return ERR_PTR(-EINVAL); + } + + if (netns_pid_attr) { + net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr)); + } else if (netns_fd_attr) { + net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr)); + } else if (netns_id_attr) { + net = get_net_ns_by_id(sock_net(skb->sk), + nla_get_u32(netns_id_attr)); + if (!net) + net = ERR_PTR(-EINVAL); + } else { + WARN_ON(1); + net = ERR_PTR(-EINVAL); + } + if (IS_ERR(net)) { + NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + return ERR_PTR(-EINVAL); + } + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return ERR_PTR(-EPERM); + } + return net; +} + +static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, + struct devlink_param_item *param_item, + enum devlink_command cmd); + +static void devlink_reload_netns_change(struct devlink *devlink, + struct net *dest_net) +{ + struct devlink_param_item *param_item; + + /* Userspace needs to be notified about devlink objects + * removed from original and entering new network namespace. + * The rest of the devlink objects are re-created during + * reload process so the notifications are generated separatelly. + */ + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + devlink_notify(devlink, DEVLINK_CMD_DEL); + + devlink_net_set(devlink, dest_net); + + devlink_notify(devlink, DEVLINK_CMD_NEW); + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); +} + static bool devlink_reload_supported(struct devlink *devlink) { return devlink->ops->reload_down && devlink->ops->reload_up; @@ -2695,9 +2769,27 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static int devlink_reload(struct devlink *devlink, struct net *dest_net, + struct netlink_ext_ack *extack) +{ + int err; + + err = devlink->ops->reload_down(devlink, !!dest_net, extack); + if (err) + return err; + + if (dest_net && !net_eq(dest_net, devlink_net(devlink))) + devlink_reload_netns_change(devlink, dest_net); + + err = devlink->ops->reload_up(devlink, extack); + devlink_reload_failed_set(devlink, !!err); + return err; +} + static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct net *dest_net = NULL; int err; if (!devlink_reload_supported(devlink)) @@ -2708,11 +2800,20 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); return err; } - err = devlink->ops->reload_down(devlink, info->extack); - if (err) - return err; - err = devlink->ops->reload_up(devlink, info->extack); - devlink_reload_failed_set(devlink, !!err); + + if (info->attrs[DEVLINK_ATTR_NETNS_PID] || + info->attrs[DEVLINK_ATTR_NETNS_FD] || + info->attrs[DEVLINK_ATTR_NETNS_ID]) { + dest_net = devlink_netns_get(skb, info); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + } + + err = devlink_reload(devlink, dest_net, info->extack); + + if (dest_net) + put_net(dest_net); + return err; } @@ -5794,6 +5895,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -8061,9 +8165,43 @@ int devlink_compat_switch_id_get(struct net_device *dev, return 0; } +static void __net_exit devlink_pernet_pre_exit(struct net *net) +{ + struct devlink *devlink; + int err; + + /* In case network namespace is getting destroyed, reload + * all devlink instances from this namespace into init_net. + */ + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (net_eq(devlink_net(devlink), net)) { + if (WARN_ON(!devlink_reload_supported(devlink))) + continue; + err = devlink_reload(devlink, &init_net, NULL); + if (err) + pr_warn("Failed to reload devlink instance into init_net\n"); + } + } + mutex_unlock(&devlink_mutex); +} + +static struct pernet_operations devlink_pernet_ops __net_initdata = { + .pre_exit = devlink_pernet_pre_exit, +}; + static int __init devlink_init(void) { - return genl_register_family(&devlink_nl_family); + int err; + + err = genl_register_family(&devlink_nl_family); + if (err) + goto out; + err = register_pernet_subsys(&devlink_pernet_ops); + +out: + WARN_ON(err); + return err; } subsys_initcall(devlink_init); -- cgit v1.2.3 From 193d357d087309f2d5ab8e8caab1af5e3bc29fa0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:56:37 +0300 Subject: net: spread "enum sock_flags" Some ints are "enum sock_flags" in fact. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/core/sock.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..ab905c4b1f0e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2512,7 +2512,7 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } -void sock_enable_timestamp(struct sock *sk, int flag); +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); diff --git a/net/core/sock.c b/net/core/sock.c index 07863edbe6fc..9774ab2ed3f1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -687,7 +687,8 @@ out: return ret; } -static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, + int valbool) { if (valbool) sock_set_flag(sk, bit); @@ -3033,7 +3034,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, } EXPORT_SYMBOL(sock_gettstamp); -void sock_enable_timestamp(struct sock *sk, int flag) +void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) { if (!sock_flag(sk, flag)) { unsigned long previous_flags = sk->sk_flags; -- cgit v1.2.3 From c62c2cfb801b6c890641ed6c91ec9e5c7ad8e2f3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Oct 2019 11:50:12 +0200 Subject: net: devlink: don't ignore errors during dumpit Currently, some dumpit function may end-up with error which is not -EMSGSIZE and this error is silently ignored. Use does not have clue that something wrong happened. Instead of silent ignore, propagate the error to user. Suggested-by: Andrew Lunn Signed-off-by: Jiri Pirko Reviewed-by: Andrew Lunn Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/devlink.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index c4d8c4ab0fb5..6d16908f34b0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1044,7 +1044,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1067,6 +1067,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1242,7 +1245,7 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1265,6 +1268,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -1469,7 +1475,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink_sb *devlink_sb; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -1494,6 +1500,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3257,7 +3266,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3285,6 +3294,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -3513,7 +3525,7 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -3546,6 +3558,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, out: mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } @@ -4168,7 +4183,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, struct devlink *devlink; int start = cb->args[0]; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); list_for_each_entry(devlink, &devlink_list, list) { @@ -4196,6 +4211,9 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, } mutex_unlock(&devlink_mutex); + if (err != -EMSGSIZE) + return err; + cb->args[0] = idx; return msg->len; } -- cgit v1.2.3 From 8273fd845447820c26b38821c8ac297f40a65260 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 08:10:31 +0200 Subject: net: devlink: export devlink net setter For newly allocated devlink instance allow drivers to set net struct Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ net/core/devlink.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3c9d4a063c98..4095657fc23f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -39,6 +39,7 @@ struct devlink { possible_net_t _net; struct mutex lock; bool reload_failed; + bool registered; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -772,6 +773,7 @@ static inline struct devlink *netdev_to_devlink(struct net_device *dev) struct ib_device; struct net *devlink_net(const struct devlink *devlink); +void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); diff --git a/net/core/devlink.c b/net/core/devlink.c index 0e464d071172..76d835581687 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -101,11 +101,19 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); -static void devlink_net_set(struct devlink *devlink, struct net *net) +static void __devlink_net_set(struct devlink *devlink, struct net *net) { write_pnet(&devlink->_net, net); } +void devlink_net_set(struct devlink *devlink, struct net *net) +{ + if (WARN_ON(devlink->registered)) + return; + __devlink_net_set(devlink, net); +} +EXPORT_SYMBOL_GPL(devlink_net_set); + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -2750,7 +2758,7 @@ static void devlink_reload_netns_change(struct devlink *devlink, DEVLINK_CMD_PARAM_DEL); devlink_notify(devlink, DEVLINK_CMD_DEL); - devlink_net_set(devlink, dest_net); + __devlink_net_set(devlink, dest_net); devlink_notify(devlink, DEVLINK_CMD_NEW); list_for_each_entry(param_item, &devlink->param_list, list) @@ -6278,7 +6286,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; - devlink_net_set(devlink, &init_net); + __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); @@ -6304,6 +6312,7 @@ int devlink_register(struct devlink *devlink, struct device *dev) { mutex_lock(&devlink_mutex); devlink->dev = dev; + devlink->registered = true; list_add_tail(&devlink->list, &devlink_list); devlink_notify(devlink, DEVLINK_CMD_NEW); mutex_unlock(&devlink_mutex); -- cgit v1.2.3 From ee85da535fe30e02908d30ec6b8960c4a991cb2d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 5 Oct 2019 20:04:42 +0200 Subject: devlink: have genetlink code to parse the attrs during dumpit Benefit from the fact that the generic netlink code can parse the attrs for dumpit op and avoid need to parse it in the op callback. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 38 ++++++-------------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 76d835581687..22f59461b0c1 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3943,29 +3943,19 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = 0; + struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - struct nlattr **attrs; bool dump = true; void *hdr; int err; start_offset = *((u64 *)&cb->args[0]); - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto out_free; - mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); if (IS_ERR(devlink)) { @@ -4042,7 +4032,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); - kfree(attrs); return skb->len; @@ -4052,8 +4041,6 @@ out_unlock: mutex_unlock(&devlink->lock); out_dev: mutex_unlock(&devlink_mutex); -out_free: - kfree(attrs); return err; } @@ -4995,21 +4982,10 @@ devlink_health_reporter_get_from_info(struct devlink *devlink, static struct devlink_health_reporter * devlink_health_reporter_get_from_cb(struct netlink_callback *cb) { + const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_health_reporter *reporter; + struct nlattr **attrs = info->attrs; struct devlink *devlink; - struct nlattr **attrs; - int err; - - attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); - if (!attrs) - return NULL; - - err = nlmsg_parse_deprecated(cb->nlh, - GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, - devlink_nl_family.policy, cb->extack); - if (err) - goto free; mutex_lock(&devlink_mutex); devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); @@ -5018,12 +4994,9 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); mutex_unlock(&devlink_mutex); - kfree(attrs); return reporter; unlock: mutex_unlock(&devlink_mutex); -free: - kfree(attrs); return NULL; } @@ -6154,7 +6127,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_REGION_READ, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_region_read_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, -- cgit v1.2.3 From 82a843de41d42681c1bbf9194b28736d06050b08 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Oct 2019 09:28:31 +0200 Subject: net: devlink: fix reporter dump dumpit In order for attrs to be prepared for reporter dump dumpit callback, set GENL_DONT_VALIDATE_DUMP_STRICT instead of GENL_DONT_VALIDATE_DUMP. Fixes: ee85da535fe3 ("devlink: have genetlink code to parse the attrs during dumpit" Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 22f59461b0c1..eb0a22f05887 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6176,7 +6176,8 @@ static const struct genl_ops devlink_nl_ops[] = { }, { .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | -- cgit v1.2.3 From 8211fbfaf2fe66ac4ca28bb52b4e7f61dcac0378 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 6 Oct 2019 18:52:43 +0200 Subject: net: core: use helper skb_ensure_writable in more places Use helper skb_ensure_writable in two more places to simplify the code. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 944de67ee95d..7d05e042c6ba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3165,12 +3165,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -3205,12 +3202,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); -- cgit v1.2.3 From 163ab96b52ae2bb2d8f188cd29f0b570610f9007 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 6 Oct 2019 21:09:27 -0700 Subject: net: sockmap: use bitmap for copy info Don't use bool array in struct sk_msg_sg, save 12 bytes. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- include/linux/skmsg.h | 12 ++++++++---- net/core/filter.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e4b3fb4bb77c..fe80d537945d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -28,13 +28,14 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - bool copy[MAX_MSG_FRAGS]; + unsigned long copy; /* The extra element is used for chaining the front and sections when * the list becomes partitioned (e.g. end < start). The crypto APIs * require the chaining. */ struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +static_assert(BITS_PER_LONG >= MAX_MSG_FRAGS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -227,7 +228,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (msg->sg.copy[msg->sg.start]) { + if (test_bit(msg->sg.start, &msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -246,7 +247,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - msg->sg.copy[msg->sg.end] = true; + __set_bit(msg->sg.end, &msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -254,7 +255,10 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { - msg->sg.copy[i] = copy_state; + if (copy_state) + __set_bit(i, &msg->sg.copy); + else + __clear_bit(i, &msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..46196e212413 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2245,7 +2245,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. */ bytes_sg_total = start - offset + bytes; - if (!msg->sg.copy[i] && bytes_sg_total <= len) + if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2450,7 +2450,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - msg->sg.copy[new] = false; + __clear_bit(new, &msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); -- cgit v1.2.3 From a11c397c43d5b27491aa2f36276713cf151a4735 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 7 Oct 2019 09:21:02 -0700 Subject: bpf/flow_dissector: add mode to enforce global BPF flow dissector Always use init_net flow dissector BPF program if it's attached and fall back to the per-net namespace one. Also, deny installing new programs if there is already one attached to the root namespace. Users can still detach their BPF programs, but can't attach any new ones (-EEXIST). Cc: Petar Penkov Acked-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- Documentation/bpf/prog_flow_dissector.rst | 3 +++ net/core/flow_dissector.c | 38 +++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/Documentation/bpf/prog_flow_dissector.rst b/Documentation/bpf/prog_flow_dissector.rst index a78bf036cadd..4d86780ab0f1 100644 --- a/Documentation/bpf/prog_flow_dissector.rst +++ b/Documentation/bpf/prog_flow_dissector.rst @@ -142,3 +142,6 @@ BPF flow dissector doesn't support exporting all the metadata that in-kernel C-based implementation can export. Notable example is single VLAN (802.1Q) and double VLAN (802.1AD) tags. Please refer to the ``struct bpf_flow_keys`` for a set of information that's currently can be exported from the BPF context. + +When BPF flow dissector is attached to the root network namespace (machine-wide +policy), users can't override it in their child network namespaces. diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7c09d87d3269..6b4b88d1599d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,19 +114,46 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, { struct bpf_prog *attached; struct net *net; + int ret = 0; net = current->nsproxy->net_ns; mutex_lock(&flow_dissector_mutex); + + if (net == &init_net) { + /* BPF flow dissector in the root namespace overrides + * any per-net-namespace one. When attaching to root, + * make sure we don't have any BPF program attached + * to the non-root namespaces. + */ + struct net *ns; + + for_each_net(ns) { + if (rcu_access_pointer(ns->flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + } else { + /* Make sure root flow dissector is not attached + * when attaching to the non-root namespace. + */ + if (rcu_access_pointer(init_net.flow_dissector_prog)) { + ret = -EEXIST; + goto out; + } + } + attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); if (attached) { /* Only one BPF program can be attached at a time */ - mutex_unlock(&flow_dissector_mutex); - return -EEXIST; + ret = -EEXIST; + goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); +out: mutex_unlock(&flow_dissector_mutex); - return 0; + return ret; } int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) @@ -910,7 +937,10 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(init_net.flow_dissector_prog); + + if (!attached) + attached = rcu_dereference(net->flow_dissector_prog); if (attached) { struct bpf_flow_keys flow_keys; -- cgit v1.2.3 From bacb7e1855969bba78b32302453d2cc8ba0bc403 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 14:20:34 -0700 Subject: Revert "tun: call dev_get_valid_name() before register_netdevice()" This reverts commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d. As noticed by Jakub, this is no longer needed after commit 11fc7d5a0a2d ("tun: fix memory leak in error path") This no longer exports dev_get_valid_name() for the exclusive use of tun driver. Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 3 --- include/linux/netdevice.h | 3 --- net/core/dev.c | 5 ++--- 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1e541b08b136..0413d182d782 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2788,9 +2788,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (!dev) return -ENOMEM; - err = dev_get_valid_name(net, dev, name); - if (err < 0) - goto err_free_dev; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fe45b2c72315..3207e0b9ec4e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4113,9 +4113,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name); - #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1) diff --git a/net/core/dev.c b/net/core/dev.c index 7d05e042c6ba..8bc3dce71fc0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1249,8 +1249,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1266,7 +1266,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device -- cgit v1.2.3 From 719b78a5674f15fef2e4a56484614657fd759978 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 11 Oct 2019 10:29:45 +0200 Subject: flow_dissector: Allow updating the flow dissector program atomically It is currently not possible to detach the flow dissector program and attach a new one in an atomic fashion, that is with a single syscall. Attempts to do so will be met with EEXIST error. This makes updates to flow dissector program hard. Traffic steering that relies on BPF-powered flow dissection gets disrupted while old program has been already detached but the new one has not been attached yet. There is also a window of opportunity to attach a flow dissector to a non-root namespace while updating the root flow dissector, thus blocking the update. Lastly, the behavior is inconsistent with cgroup BPF programs, which can be replaced with a single bpf(BPF_PROG_ATTACH, ...) syscall without any restrictions. Allow attaching a new flow dissector program when another one is already present with a restriction that it can't be the same program. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191011082946.22695-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6b4b88d1599d..dbf502c18656 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -128,6 +128,8 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct net *ns; for_each_net(ns) { + if (ns == &init_net) + continue; if (rcu_access_pointer(ns->flow_dissector_prog)) { ret = -EEXIST; goto out; @@ -145,12 +147,14 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached) { - /* Only one BPF program can be attached at a time */ - ret = -EEXIST; + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; goto out; } rcu_assign_pointer(net->flow_dissector_prog, prog); + if (attached) + bpf_prog_put(attached); out: mutex_unlock(&flow_dissector_mutex); return ret; -- cgit v1.2.3 From baead859edbb3cd53b8e388c1f33641ce01d4c01 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 11 Oct 2019 09:43:03 +0100 Subject: xdp: Trivial, fix spelling in function description Fix typo 'boolian' into 'boolean'. Signed-off-by: Anton Ivanov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191011084303.28418-1-anton.ivanov@cambridgegreys.com --- net/core/xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index d7bf62ffbb5e..20781ad5f9c3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame - * while still leveraging this protection. The @napi_direct boolian + * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -- cgit v1.2.3 From 28e72b26ddeeef474ee9a8dd15df61b35ff557d8 Mon Sep 17 00:00:00 2001 From: Vito Caputo Date: Wed, 9 Oct 2019 21:08:24 -0700 Subject: sock_get_timeout: drop unnecessary return variable Remove pointless use of size return variable by directly returning sizes. Signed-off-by: Vito Caputo Signed-off-by: David S. Miller --- net/core/sock.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 24e93407239a..ceda6b126d84 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -333,7 +333,6 @@ EXPORT_SYMBOL(__sk_backlog_rcv); static int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; - int size; if (timeo == MAX_SCHEDULE_TIMEOUT) { tv.tv_sec = 0; @@ -354,13 +353,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) old_tv.tv_sec = tv.tv_sec; old_tv.tv_usec = tv.tv_usec; *(struct __kernel_old_timeval *)optval = old_tv; - size = sizeof(old_tv); - } else { - *(struct __kernel_sock_timeval *)optval = tv; - size = sizeof(tv); + return sizeof(old_tv); } - return size; + *(struct __kernel_sock_timeval *)optval = tv; + return sizeof(tv); } static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) -- cgit v1.2.3 From 402818205c9ecdfd922fdfa58fb113f60fdda523 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 10 Oct 2019 15:18:48 +0200 Subject: devlink: don't do reporter recovery if the state is healthy If reporter state is healthy, don't call into a driver for recover and don't increase recovery count. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index eb0a22f05887..95887462eecf 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4851,6 +4851,9 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, { int err; + if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return 0; + if (!reporter->ops->recover) return -EOPNOTSUPP; -- cgit v1.2.3 From e7a981050a7fb9a14b652365c00d9c5a025704ce Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 10 Oct 2019 15:18:49 +0200 Subject: devlink: propagate extack down to health reporter ops During health reporter operations, driver might want to fill-up the extack message, so propagate extack down to the health reporter ops. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 9 ++++++--- .../net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 6 ++++-- .../net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/health.c | 12 ++++++++---- include/net/devlink.h | 8 +++++--- net/core/devlink.c | 20 +++++++++++--------- 6 files changed, 38 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index e664392dccc0..ff1bc0ec2e7c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -16,7 +16,8 @@ #include "bnxt_devlink.h" static int bnxt_fw_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg) + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); struct bnxt_fw_health *health = bp->fw_health; @@ -66,7 +67,8 @@ static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = { }; static int bnxt_fw_reset_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); @@ -84,7 +86,8 @@ struct devlink_health_reporter_ops bnxt_dl_fw_reset_reporter_ops = { }; static int bnxt_fw_fatal_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); struct bnxt_fw_reporter_ctx *fw_reporter_ctx = priv_ctx; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index b860569d4247..6c72b592315b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -222,7 +222,8 @@ static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) } static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, - void *context) + void *context, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); struct mlx5e_err_ctx *err_ctx = context; @@ -301,7 +302,8 @@ static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, } static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg) + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); struct mlx5e_params *params = &priv->channels.params; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index bfed558637c2..b468549e96ff 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -135,7 +135,8 @@ static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) } static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, - void *context) + void *context, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); struct mlx5e_err_ctx *err_ctx = context; @@ -205,7 +206,8 @@ mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, } static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg) + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) { struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d685122d9ff7..be3c3c704bfc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -390,7 +390,8 @@ static void print_health_info(struct mlx5_core_dev *dev) static int mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg) + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); struct mlx5_core_health *health = &dev->priv.health; @@ -491,7 +492,8 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, static int mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg, void *priv_ctx) + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); int err; @@ -545,7 +547,8 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { static int mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); @@ -555,7 +558,8 @@ mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, #define MLX5_CR_DUMP_CHUNK_SIZE 256 static int mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg, void *priv_ctx) + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter); u32 crdump_size = dev->priv.health.crdump_size; diff --git a/include/net/devlink.h b/include/net/devlink.h index 4095657fc23f..6bf3b9e0595a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -507,11 +507,13 @@ enum devlink_health_reporter_state { struct devlink_health_reporter_ops { char *name; int (*recover)(struct devlink_health_reporter *reporter, - void *priv_ctx); + void *priv_ctx, struct netlink_ext_ack *extack); int (*dump)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg, void *priv_ctx); + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack); int (*diagnose)(struct devlink_health_reporter *reporter, - struct devlink_fmsg *fmsg); + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack); }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index 95887462eecf..97e9a2246929 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4847,7 +4847,7 @@ EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); static int devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, struct netlink_ext_ack *extack) { int err; @@ -4857,7 +4857,7 @@ devlink_health_reporter_recover(struct devlink_health_reporter *reporter, if (!reporter->ops->recover) return -EOPNOTSUPP; - err = reporter->ops->recover(reporter, priv_ctx); + err = reporter->ops->recover(reporter, priv_ctx, extack); if (err) return err; @@ -4878,7 +4878,8 @@ devlink_health_dump_clear(struct devlink_health_reporter *reporter) } static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) + void *priv_ctx, + struct netlink_ext_ack *extack) { int err; @@ -4899,7 +4900,7 @@ static int devlink_health_do_dump(struct devlink_health_reporter *reporter, goto dump_err; err = reporter->ops->dump(reporter, reporter->dump_fmsg, - priv_ctx); + priv_ctx, extack); if (err) goto dump_err; @@ -4946,11 +4947,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); + devlink_health_do_dump(reporter, priv_ctx, NULL); mutex_unlock(&reporter->dump_lock); if (reporter->auto_recover) - return devlink_health_reporter_recover(reporter, priv_ctx); + return devlink_health_reporter_recover(reporter, + priv_ctx, NULL); return 0; } @@ -5188,7 +5190,7 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, if (!reporter) return -EINVAL; - err = devlink_health_reporter_recover(reporter, NULL); + err = devlink_health_reporter_recover(reporter, NULL, info->extack); devlink_health_reporter_put(reporter); return err; @@ -5221,7 +5223,7 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, if (err) goto out; - err = reporter->ops->diagnose(reporter, fmsg); + err = reporter->ops->diagnose(reporter, fmsg, info->extack); if (err) goto out; @@ -5256,7 +5258,7 @@ devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb, } mutex_lock(&reporter->dump_lock); if (!start) { - err = devlink_health_do_dump(reporter, NULL); + err = devlink_health_do_dump(reporter, NULL, cb->extack); if (err) goto unlock; cb->args[1] = reporter->dump_ts; -- cgit v1.2.3 From 6570bc79c0dfff0f228b7afd2de720fb4e84d61d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 14 Oct 2019 11:00:33 +0300 Subject: net: core: use listified Rx for GRO_NORMAL in napi_gro_receive() Commit 323ebb61e32b4 ("net: use listified RX for handling GRO_NORMAL skbs") made use of listified skb processing for the users of napi_gro_frags(). The same technique can be used in a way more common napi_gro_receive() to speed up non-merged (GRO_NORMAL) skbs for a wide range of drivers including gro_cells and mac80211 users. This slightly changes the return value in cases where skb is being dropped by the core stack, but it seems to have no impact on related drivers' functionality. gro_normal_batch is left untouched as it's very individual for every single system configuration and might be tuned in manual order to achieve an optimal performance. Signed-off-by: Alexander Lobakin Acked-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 8bc3dce71fc0..74f593986524 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5884,6 +5884,26 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. + */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -5891,12 +5911,13 @@ static void napi_skb_free_stolen_head(struct sk_buff *skb) kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5928,7 +5949,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5974,26 +5995,6 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. - */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/linux/bpf.h | 18 +++++++---- include/uapi/linux/bpf.h | 27 ++++++++++++++++- kernel/bpf/btf.c | 68 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 44 +++++++++++++++++---------- kernel/trace/bpf_trace.c | 4 +++ net/core/filter.c | 15 +++++++++- tools/include/uapi/linux/bpf.h | 27 ++++++++++++++++- 7 files changed, 180 insertions(+), 23 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a7330d75bb94..2c2c29b49845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -213,6 +213,7 @@ enum bpf_arg_type { ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ + ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ }; /* type of values returned from helper functions */ @@ -235,11 +236,17 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; enum bpf_return_type ret_type; - enum bpf_arg_type arg1_type; - enum bpf_arg_type arg2_type; - enum bpf_arg_type arg3_type; - enum bpf_arg_type arg4_type; - enum bpf_arg_type arg5_type; + union { + struct { + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; + }; + enum bpf_arg_type arg_type[5]; + }; + u32 *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -765,6 +772,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 271d27cd427f..f7557af39756 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3626,6 +3626,74 @@ again: return -EINVAL; } +u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +{ + char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; + const struct btf_param *args; + const struct btf_type *t; + const char *tname, *sym; + u32 btf_id, i; + + if (IS_ERR(btf_vmlinux)) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return -EINVAL; + } + + sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4); + if (!sym) { + bpf_log(log, "kernel doesn't have kallsyms\n"); + return -EFAULT; + } + + for (i = 1; i <= btf_vmlinux->nr_types; i++) { + t = btf_type_by_id(btf_vmlinux, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) + continue; + tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + if (!strcmp(tname, fnname)) + break; + } + if (i > btf_vmlinux->nr_types) { + bpf_log(log, "helper %s type is not found\n", fnname); + return -ENOENT; + } + + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_ptr(t)) + return -EFAULT; + t = btf_type_by_id(btf_vmlinux, t->type); + if (!btf_type_is_func_proto(t)) + return -EFAULT; + + args = (const struct btf_param *)(t + 1); + if (arg >= btf_type_vlen(t)) { + bpf_log(log, "bpf helper %s doesn't have %d-th argument\n", + fnname, arg); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, args[arg].type); + if (!btf_type_is_ptr(t) || !t->type) { + /* anything but the pointer to struct is a helper config bug */ + bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n"); + return -EFAULT; + } + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + /* skip modifiers */ + while (btf_type_is_modifier(t)) { + btf_id = t->type; + t = btf_type_by_id(btf_vmlinux, t->type); + } + if (!btf_type_is_struct(t)) { + bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n"); + return -EFAULT; + } + bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4, + arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off)); + return btf_id; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fba9ef6a831b..556e82f8869b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -205,6 +205,7 @@ struct bpf_call_arg_meta { u64 msize_umax_value; int ref_obj_id; int func_id; + u32 btf_id; }; struct btf *btf_vmlinux; @@ -3439,6 +3440,22 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + } else if (arg_type == ARG_PTR_TO_BTF_ID) { + expected_type = PTR_TO_BTF_ID; + if (type != expected_type) + goto err_type; + if (reg->btf_id != meta->btf_id) { + verbose(env, "Helper has type %s got %s in R%d\n", + kernel_type_name(meta->btf_id), + kernel_type_name(reg->btf_id), regno); + + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -3586,6 +3603,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_skb_output && func_id != BPF_FUNC_perf_event_read_value) goto error; break; @@ -3673,6 +3691,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: case BPF_FUNC_perf_event_read_value: + case BPF_FUNC_skb_output: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -4127,21 +4146,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); - if (err) - return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); - if (err) - return err; + for (i = 0; i < 5; i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { + if (!fn->btf_id[i]) + fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); + meta.btf_id = fn->btf_id[i]; + } + err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); + if (err) + return err; + } err = record_func_map(env, &meta, func_id, insn_idx); if (err) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6221e8c6ecc3..52f7e9d8c29b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -995,6 +995,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +extern const struct bpf_func_proto bpf_skb_output_proto; + BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { @@ -1053,6 +1055,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: diff --git a/net/core/filter.c b/net/core/filter.c index 46196e212413..728ba6203c1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3798,7 +3798,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; - if (unlikely(skb_size > skb->len)) + if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, @@ -3816,6 +3816,19 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +static u32 bpf_skb_output_btf_ids[5]; +const struct bpf_func_proto bpf_skb_output_proto = { + .func = bpf_skb_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_skb_output_btf_ids, +}; + static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From a8c41a68076e88d24fd7c0ac39de93654a298594 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Thu, 17 Oct 2019 18:34:13 +0800 Subject: pktgen: remove unnecessary assignment in pktgen_xmit() variable ret is not used after jumping to "unlock" label, so the assignment is redundant. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- net/core/pktgen.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 48b1e429857c..294bfcf0ce0e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3404,7 +3404,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) HARD_TX_LOCK(odev, txq, smp_processor_id()); if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { - ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; } -- cgit v1.2.3 From f95f96a4946ac9a38acf85f8421d8e9c5cbb516f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 25 Oct 2019 17:18:36 +0800 Subject: sock: remove unneeded semicolon remove unneeded semicolon. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5cb567e36f5e..997b352c2a72 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3013,7 +3013,7 @@ int sock_gettstamp(struct socket *sock, void __user *userstamp, return -ENOENT; if (ts.tv_sec == 0) { ktime_t kt = ktime_get_real(); - sock_write_timestamp(sk, kt);; + sock_write_timestamp(sk, kt); ts = ktime_to_timespec64(kt); } -- cgit v1.2.3 From 98298e6ca6d5908f96e529e70a254a4d5bf754e7 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:50 +0100 Subject: flow_dissector: add meaningful comments Documents two piece of code which can't be understood at a glance. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 90bd210be060..7747af3cc500 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -282,6 +282,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index dbf502c18656..bc22b384ac6c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1408,6 +1408,9 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + /* flow.addrs MUST be the last member in struct flow_keys because + * different L3 protocols have different address length + */ BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != sizeof(*flow) - sizeof(flow->addrs)); @@ -1455,6 +1458,9 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow) } EXPORT_SYMBOL(flow_get_u32_dst); +/* Sort the source and destination IP (and the ports if the IP are the same), + * to have consistent hash within the two directions + */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; -- cgit v1.2.3 From 3b336d6f4ec690b0082bcffe55bac22f234a41ff Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:51 +0100 Subject: flow_dissector: skip the ICMP dissector for non ICMP packets FLOW_DISSECTOR_KEY_ICMP is checked for every packet, not only ICMP ones. Even if the test overhead is probably negligible, move the ICMP dissector code under the big 'switch(ip_proto)' so it gets called only for ICMP packets. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bc22b384ac6c..0fb721976f3d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -234,6 +234,25 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +/* If FLOW_DISSECTOR_KEY_ICMP is set, get the Type and Code from an ICMP packet + * using skb_flow_get_be16(). + */ +static void __skb_flow_dissect_icmp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, int thoff, int hlen) +{ + struct flow_dissector_key_icmp *key_icmp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) + return; + + key_icmp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ICMP, + target_container); + key_icmp->icmp = skb_flow_get_be16(skb, thoff, data, hlen); +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -884,7 +903,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; - struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; struct bpf_prog *attached = NULL; @@ -1329,6 +1347,12 @@ ip_proto_again: data, nhoff, hlen); break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + __skb_flow_dissect_icmp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; + default: break; } @@ -1342,14 +1366,6 @@ ip_proto_again: data, hlen); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP)) { - key_icmp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ICMP, - target_container); - key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); - } - /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: -- cgit v1.2.3 From 5dec597e5cd0f4c3000d120508efa64157d5bd7a Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 29 Oct 2019 14:50:52 +0100 Subject: flow_dissector: extract more ICMP information The ICMP flow dissector currently parses only the Type and Code fields. Some ICMP packets (echo, timestamp) have a 16 bit Identifier field which is used to correlate packets. Add such field in flow_dissector_key_icmp and replace skb_flow_get_be16() with a more complex function which populate this field. Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 19 +++++++----- net/core/flow_dissector.c | 74 ++++++++++++++++++++++++++++++-------------- 2 files changed, 61 insertions(+), 32 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 7747af3cc500..f8541d018848 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,8 @@ #include #include +struct sk_buff; + /** * struct flow_dissector_key_control: * @thoff: Transport header offset @@ -156,19 +158,16 @@ struct flow_dissector_key_ports { /** * flow_dissector_key_icmp: - * @ports: type and code of ICMP header - * icmp: ICMP type (high) and code (low) * type: ICMP type * code: ICMP code + * id: session identifier */ struct flow_dissector_key_icmp { - union { - __be16 icmp; - struct { - u8 type; - u8 code; - }; + struct { + u8 type; + u8 code; }; + u16 id; }; /** @@ -282,6 +281,7 @@ struct flow_keys { struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; + struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; @@ -316,6 +316,9 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) } u32 flow_hash_from_keys(struct flow_keys *keys); +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0fb721976f3d..0807df0bde02 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -178,27 +178,6 @@ int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) mutex_unlock(&flow_dissector_mutex); return 0; } -/** - * skb_flow_get_be16 - extract be16 entity - * @skb: sk_buff to extract from - * @poff: offset to extract at - * @data: raw buffer pointer to the packet - * @hlen: packet header length - * - * The function will try to retrieve a be32 entity at - * offset poff - */ -static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff, - void *data, int hlen) -{ - __be16 *u, _u; - - u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u); - if (u) - return *u; - - return 0; -} /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -234,8 +213,54 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -/* If FLOW_DISSECTOR_KEY_ICMP is set, get the Type and Code from an ICMP packet - * using skb_flow_get_be16(). +static bool icmp_has_id(u8 type) +{ + switch (type) { + case ICMP_ECHO: + case ICMP_ECHOREPLY: + case ICMP_TIMESTAMP: + case ICMP_TIMESTAMPREPLY: + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + return true; + } + + return false; +} + +/** + * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields + * @skb: sk_buff to extract from + * @key_icmp: struct flow_dissector_key_icmp to fill + * @data: raw buffer pointer to the packet + * @toff: offset to extract at + * @hlen: packet header length + */ +void skb_flow_get_icmp_tci(const struct sk_buff *skb, + struct flow_dissector_key_icmp *key_icmp, + void *data, int thoff, int hlen) +{ + struct icmphdr *ih, _ih; + + ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); + if (!ih) + return; + + key_icmp->type = ih->type; + key_icmp->code = ih->code; + + /* As we use 0 to signal that the Id field is not present, + * avoid confusion with packets without such field + */ + if (icmp_has_id(ih->type)) + key_icmp->id = ih->un.echo.id ? : 1; + else + key_icmp->id = 0; +} +EXPORT_SYMBOL(skb_flow_get_icmp_tci); + +/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet + * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -250,7 +275,8 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); - key_icmp->icmp = skb_flow_get_be16(skb, thoff, data, hlen); + + skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } void skb_flow_dissect_meta(const struct sk_buff *skb, -- cgit v1.2.3 From d0083d98f685b9f4fe810570f93cef0b0bb6b354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:14 -0800 Subject: net_sched: extend packet counter to 64bit After this change, qdisc packet counter is no longer a 32bit quantity. We still export 32bit values to user. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/gen_stats.h | 4 ++-- net/core/gen_stats.c | 3 +-- net/sched/act_simple.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 5f3889e7ec1b..1424e02cef90 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -10,8 +10,8 @@ /* Note: this used to be in include/uapi/linux/gen_stats.h */ struct gnet_stats_basic_packed { __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); + __u64 packets; +}; struct gnet_stats_basic_cpu { struct gnet_stats_basic_packed bstats; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 36888f5e09eb..fe33e2a9841e 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -123,8 +123,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, for_each_possible_cpu(i) { struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); unsigned int start; - u64 bytes; - u32 packets; + u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 97639b259cd7..9813ca4006dd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -35,7 +35,7 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ - pr_info("simple: %s_%d\n", + pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); return d->tcf_action; -- cgit v1.2.3 From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:15 -0800 Subject: net_sched: add TCA_STATS_PKT64 attribute Now the kernel uses 64bit packet counters in scheduler layer, we want to export these counters to user space. Instead risking breaking user space by adding fields to struct gnet_stats_basic, add a new TCA_STATS_PKT64. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 1 + net/core/gen_stats.c | 9 +++++++-- net/sched/act_api.c | 2 ++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 4eaacdf452e3..852f234f1fd6 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -13,6 +13,7 @@ enum { TCA_STATS_RATE_EST64, TCA_STATS_PAD, TCA_STATS_BASIC_HW, + TCA_STATS_PKT64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index fe33e2a9841e..1d653fbfcf52 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -175,12 +175,17 @@ ___gnet_stats_copy_basic(const seqcount_t *running, if (d->tail) { struct gnet_stats_basic sb; + int res; memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, type, &sb, sizeof(sb), - TCA_STATS_PAD); + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); + if (res < 0 || sb.packets == bstats.packets) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, + sizeof(bstats.packets), TCA_STATS_PAD); } return 0; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 6284c552e943..bda1ba25c59e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -188,6 +188,8 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_PKT64 */ + + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ -- cgit v1.2.3 From 9d027e3a83f39b819e908e4e09084277a2e45e95 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Nov 2019 14:11:49 -0800 Subject: net: neigh: use long type to store jiffies delta A difference of two unsigned long needs long storage. Fixes: c7fb64db001f ("[NETLINK]: Neighbour table configuration and statistics via rtnetlink") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5480edff0c86..8c82e95f7539 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2052,8 +2052,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, goto nla_put_failure; { unsigned long now = jiffies; - unsigned int flush_delta = now - tbl->last_flush; - unsigned int rand_delta = now - tbl->last_rand; + long flush_delta = now - tbl->last_flush; + long rand_delta = now - tbl->last_rand; struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, -- cgit v1.2.3 From 1c8dd9cb4697a425ecb9e9fb8a6c05955642e141 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 20:52:40 -0800 Subject: net_sched: gen_estimator: extend packet counter to 64bit I forgot to change last_packets field in struct net_rate_estimator. Without this fix, rate estimators would misbehave after more than 2^32 packets have been sent. Another solution would be to be careful and only use the 32 least significant bits of packets counters, but we have a hole in net_rate_estimator structure and this looks easier to read/maintain. Fixes: d0083d98f685 ("net_sched: extend packet counter to 64bit") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index bfe7bdd4c340..80dbf2f4016e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -48,7 +48,7 @@ struct net_rate_estimator { u8 intvl_log; /* period : (250ms << intvl_log) */ seqcount_t seq; - u32 last_packets; + u64 last_packets; u64 last_bytes; u64 avpps; @@ -83,7 +83,7 @@ static void est_timer(struct timer_list *t) brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); + rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); -- cgit v1.2.3 From 6896cc4d8fe6fe6163d6f0baa02a270da68896e8 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:09 +0200 Subject: devlink: Add layer 3 generic packet traps Add packet traps that can report packets that were dropped during layer 3 forwarding. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink-trap.rst | 41 +++++++++++++++++++++++++++++++ include/net/devlink.h | 27 ++++++++++++++++++++ net/core/devlink.c | 9 +++++++ 3 files changed, 77 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst index 8e90a85f3bd5..dc3dc87217c9 100644 --- a/Documentation/networking/devlink-trap.rst +++ b/Documentation/networking/devlink-trap.rst @@ -162,6 +162,47 @@ be added to the following table: - ``drop`` - Traps packets that the device decided to drop because they could not be enqueued to a transmission queue which is full + * - ``non_ip`` + - ``drop`` + - Traps packets that the device decided to drop because they need to + undergo a layer 3 lookup, but are not IP or MPLS packets + * - ``uc_dip_over_mc_dmac`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and they have a unicast destination IP and a multicast destination + MAC + * - ``dip_is_loopback_address`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and their destination IP is the loopback address (i.e., 127.0.0.0/8 + and ::1/128) + * - ``sip_is_mc`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and their source IP is multicast (i.e., 224.0.0.0/8 and ff::/8) + * - ``sip_is_loopback_address`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and their source IP is the loopback address (i.e., 127.0.0.0/8 and ::1/128) + * - ``ip_header_corrupted`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and their IP header is corrupted: wrong checksum, wrong IP version + or too short Internet Header Length (IHL) + * - ``ipv4_sip_is_limited_bc`` + - ``drop`` + - Traps packets that the device decided to drop because they need to be + routed and their source IP is limited broadcast (i.e., 255.255.255.255/32) + * - ``ipv6_mc_dip_reserved_scope`` + - ``drop`` + - Traps IPv6 packets that the device decided to drop because they need to + be routed and their IPv6 multicast destination IP has a reserved scope + (i.e., ffx0::/16) + * - ``ipv6_mc_dip_interface_local_scope`` + - ``drop`` + - Traps IPv6 packets that the device decided to drop because they need to + be routed and their IPv6 multicast destination IP has an interface-local scope + (i.e., ffx1::/16) Driver-specific Packet Traps ============================ diff --git a/include/net/devlink.h b/include/net/devlink.h index 6bf3b9e0595a..df7814d55bf9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -569,6 +569,15 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_ROUTE, DEVLINK_TRAP_GENERIC_ID_TTL_ERROR, DEVLINK_TRAP_GENERIC_ID_TAIL_DROP, + DEVLINK_TRAP_GENERIC_ID_NON_IP_PACKET, + DEVLINK_TRAP_GENERIC_ID_UC_DIP_MC_DMAC, + DEVLINK_TRAP_GENERIC_ID_DIP_LB, + DEVLINK_TRAP_GENERIC_ID_SIP_MC, + DEVLINK_TRAP_GENERIC_ID_SIP_LB, + DEVLINK_TRAP_GENERIC_ID_CORRUPTED_IP_HDR, + DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -607,6 +616,24 @@ enum devlink_trap_group_generic_id { "ttl_value_is_too_small" #define DEVLINK_TRAP_GENERIC_NAME_TAIL_DROP \ "tail_drop" +#define DEVLINK_TRAP_GENERIC_NAME_NON_IP_PACKET \ + "non_ip" +#define DEVLINK_TRAP_GENERIC_NAME_UC_DIP_MC_DMAC \ + "uc_dip_over_mc_dmac" +#define DEVLINK_TRAP_GENERIC_NAME_DIP_LB \ + "dip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_MC \ + "sip_is_mc" +#define DEVLINK_TRAP_GENERIC_NAME_SIP_LB \ + "sip_is_loopback_address" +#define DEVLINK_TRAP_GENERIC_NAME_CORRUPTED_IP_HDR \ + "ip_header_corrupted" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_SIP_BC \ + "ipv4_sip_is_limited_bc" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_RESERVED_SCOPE \ + "ipv6_mc_dip_reserved_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ + "ipv6_mc_dip_interface_local_scope" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" diff --git a/net/core/devlink.c b/net/core/devlink.c index 97e9a2246929..9bbe2162f22f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7602,6 +7602,15 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), + DEVLINK_TRAP(NON_IP_PACKET, DROP), + DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), + DEVLINK_TRAP(DIP_LB, DROP), + DEVLINK_TRAP(SIP_MC, DROP), + DEVLINK_TRAP(SIP_LB, DROP), + DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), + DEVLINK_TRAP(IPV4_SIP_BC, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), + DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ -- cgit v1.2.3 From 3b063ae57bdfec5e574ace440e6c3f34c4115a92 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 7 Nov 2019 18:42:14 +0200 Subject: devlink: Add layer 3 generic packet exception traps Add layer 3 generic packet exception traps that can report trapped packets and documentation of the traps. Unlike drop traps, these exception traps also need to inject the packet to the kernel's receive path. For example, a packet that was trapped due to unreachable neighbour need to be injected into the kernel so that it will trigger an ARP request or a neighbour solicitation message. Signed-off-by: Amit Cohen Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- Documentation/networking/devlink-trap.rst | 20 ++++++++++++++++++++ include/net/devlink.h | 18 ++++++++++++++++++ net/core/devlink.c | 6 ++++++ 3 files changed, 44 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink-trap.rst b/Documentation/networking/devlink-trap.rst index dc3dc87217c9..dc9659ca06fa 100644 --- a/Documentation/networking/devlink-trap.rst +++ b/Documentation/networking/devlink-trap.rst @@ -203,6 +203,26 @@ be added to the following table: - Traps IPv6 packets that the device decided to drop because they need to be routed and their IPv6 multicast destination IP has an interface-local scope (i.e., ffx1::/16) + * - ``mtu_value_is_too_small`` + - ``exception`` + - Traps packets that should have been routed by the device, but were bigger + than the MTU of the egress interface + * - ``unresolved_neigh`` + - ``exception`` + - Traps packets that did not have a matching IP neighbour after routing + * - ``mc_reverse_path_forwarding`` + - ``exception`` + - Traps multicast IP packets that failed reverse-path forwarding (RPF) + check during multicast routing + * - ``reject_route`` + - ``exception`` + - Traps packets that hit reject routes (i.e., "unreachable", "prohibit") + * - ``ipv4_lpm_miss`` + - ``exception`` + - Traps unicast IPv4 packets that did not match any route + * - ``ipv6_lpm_miss`` + - ``exception`` + - Traps unicast IPv6 packets that did not match any route Driver-specific Packet Traps ============================ diff --git a/include/net/devlink.h b/include/net/devlink.h index df7814d55bf9..8d6b5846822c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -578,6 +578,12 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV4_SIP_BC, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_RESERVED_SCOPE, DEVLINK_TRAP_GENERIC_ID_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_MTU_ERROR, + DEVLINK_TRAP_GENERIC_ID_UNRESOLVED_NEIGH, + DEVLINK_TRAP_GENERIC_ID_RPF, + DEVLINK_TRAP_GENERIC_ID_REJECT_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV4_LPM_UNICAST_MISS, + DEVLINK_TRAP_GENERIC_ID_IPV6_LPM_UNICAST_MISS, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -634,6 +640,18 @@ enum devlink_trap_group_generic_id { "ipv6_mc_dip_reserved_scope" #define DEVLINK_TRAP_GENERIC_NAME_IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE \ "ipv6_mc_dip_interface_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_MTU_ERROR \ + "mtu_value_is_too_small" +#define DEVLINK_TRAP_GENERIC_NAME_UNRESOLVED_NEIGH \ + "unresolved_neigh" +#define DEVLINK_TRAP_GENERIC_NAME_RPF \ + "mc_reverse_path_forwarding" +#define DEVLINK_TRAP_GENERIC_NAME_REJECT_ROUTE \ + "reject_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_LPM_UNICAST_MISS \ + "ipv4_lpm_miss" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_LPM_UNICAST_MISS \ + "ipv6_lpm_miss" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" diff --git a/net/core/devlink.c b/net/core/devlink.c index 9bbe2162f22f..ff53f7d29dea 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7611,6 +7611,12 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV4_SIP_BC, DROP), DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), + DEVLINK_TRAP(MTU_ERROR, EXCEPTION), + DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), + DEVLINK_TRAP(RPF, EXCEPTION), + DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), + DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), + DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), }; #define DEVLINK_TRAP_GROUP(_id) \ -- cgit v1.2.3 From c305c6ae79e2ce20c22660ceda94f0d86d639a82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 18:29:11 -0800 Subject: net: add annotations on hh->hh_len lockless accesses KCSAN reported a data-race [1] While we can use READ_ONCE() on the read sides, we need to make sure hh->hh_len is written last. [1] BUG: KCSAN: data-race in eth_header_cache / neigh_resolve_output write to 0xffff8880b9dedcb8 of 4 bytes by task 29760 on cpu 0: eth_header_cache+0xa9/0xd0 net/ethernet/eth.c:247 neigh_hh_init net/core/neighbour.c:1463 [inline] neigh_resolve_output net/core/neighbour.c:1480 [inline] neigh_resolve_output+0x415/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b9dedcb8 of 4 bytes by task 29572 on cpu 1: neigh_resolve_output net/core/neighbour.c:1479 [inline] neigh_resolve_output+0x113/0x470 net/core/neighbour.c:1470 neigh_output include/net/neighbour.h:511 [inline] ip6_finish_output2+0x7a2/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] NF_HOOK include/linux/netfilter.h:305 [inline] ndisc_send_skb+0x459/0x5f0 net/ipv6/ndisc.c:505 ndisc_send_ns+0x207/0x430 net/ipv6/ndisc.c:647 rt6_probe_deferred+0x98/0xf0 net/ipv6/route.c:615 process_one_work+0x3d4/0x890 kernel/workqueue.c:2269 worker_thread+0xa0/0x800 kernel/workqueue.c:2415 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29572 Comm: kworker/1:4 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events rt6_probe_deferred Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- drivers/firewire/net.c | 6 +++++- include/net/neighbour.h | 2 +- net/core/neighbour.c | 4 ++-- net/ethernet/eth.c | 7 ++++++- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index b132ab9ad607..715e491dfbc3 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -250,7 +250,11 @@ static int fwnet_header_cache(const struct neighbour *neigh, h = (struct fwnet_header *)((u8 *)hh->hh_data + HH_DATA_OFF(sizeof(*h))); h->h_proto = type; memcpy(h->h_dest, neigh->ha, net->addr_len); - hh->hh_len = FWNET_HLEN; + + /* Pairs with the READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, FWNET_HLEN); return 0; } diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 50a67bd6a434..6a86e49181db 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -468,7 +468,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb do { seq = read_seqbegin(&hh->hh_lock); - hh_len = hh->hh_len; + hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8c82e95f7539..652da6369037 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1197,7 +1197,7 @@ static void neigh_update_hhs(struct neighbour *neigh) if (update) { hh = &neigh->hh; - if (hh->hh_len) { + if (READ_ONCE(hh->hh_len)) { write_seqlock_bh(&hh->hh_lock); update(hh, neigh->dev, neigh->ha); write_sequnlock_bh(&hh->hh_lock); @@ -1476,7 +1476,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) struct net_device *dev = neigh->dev; unsigned int seq; - if (dev->header_ops->cache && !neigh->hh.hh_len) + if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len)) neigh_hh_init(neigh); do { diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 17374afee28f..9040fe55e0f5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -244,7 +244,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); - hh->hh_len = ETH_HLEN; + + /* Pairs with READ_ONCE() in neigh_resolve_output(), + * neigh_hh_output() and neigh_update_hhs(). + */ + smp_store_release(&hh->hh_len, ETH_HLEN); + return 0; } EXPORT_SYMBOL(eth_header_cache); -- cgit v1.2.3 From 90b2be27bb0e56483f335cc10fb59ec66882b949 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Nov 2019 08:45:23 -0800 Subject: net/sched: annotate lockless accesses to qdisc->empty KCSAN reported the following race [1] BUG: KCSAN: data-race in __dev_queue_xmit / net_tx_action read to 0xffff8880ba403508 of 1 bytes by task 21814 on cpu 1: __dev_xmit_skb net/core/dev.c:3389 [inline] __dev_queue_xmit+0x9db/0x1b40 net/core/dev.c:3761 dev_queue_xmit+0x21/0x30 net/core/dev.c:3825 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip6_finish_output2+0x873/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880ba403508 of 1 bytes by interrupt on cpu 0: qdisc_run_begin include/net/sch_generic.h:160 [inline] qdisc_run include/net/pkt_sched.h:120 [inline] net_tx_action+0x2b1/0x6c0 net/core/dev.c:4551 __do_softirq+0x115/0x33f kernel/softirq.c:292 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337 do_softirq kernel/softirq.c:329 [inline] __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189 local_bh_enable include/linux/bottom_half.h:32 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:688 [inline] ip6_finish_output2+0x7bb/0xec0 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 21817 Comm: syz-executor.2 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Paolo Abeni Cc: Davide Caratti Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 +++--- net/core/dev.c | 2 +- net/sched/sch_generic.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a8b0a9a4c686..d43da37737be 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -148,8 +148,8 @@ static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { if (qdisc_is_percpu_stats(qdisc)) - return qdisc->empty; - return !qdisc->q.qlen; + return READ_ONCE(qdisc->empty); + return !READ_ONCE(qdisc->q.qlen); } static inline bool qdisc_run_begin(struct Qdisc *qdisc) @@ -157,7 +157,7 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) if (qdisc->flags & TCQ_F_NOLOCK) { if (!spin_trylock(&qdisc->seqlock)) return false; - qdisc->empty = false; + WRITE_ONCE(qdisc->empty, false); } else if (qdisc_is_running(qdisc)) { return false; } diff --git a/net/core/dev.c b/net/core/dev.c index bb15800c8cb5..1c799d486623 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3607,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && qdisc_run_begin(q)) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8561e825f401..5ab696efca95 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -652,7 +652,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (likely(skb)) { qdisc_update_stats_at_dequeue(qdisc, skb); } else { - qdisc->empty = true; + WRITE_ONCE(qdisc->empty, true); } return skb; -- cgit v1.2.3 From a0c76345e3d3dbc40c39de2e00d15a3b7eef7885 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 8 Nov 2019 21:42:43 +0100 Subject: devlink: disallow reload operation during device cleanup There is a race between driver code that does setup/cleanup of device and devlink reload operation that in some drivers works with the same code. Use after free could we easily obtained by running: while true; do echo 10 > /sys/bus/netdevsim/new_device devlink dev reload netdevsim/netdevsim10 & echo 10 > /sys/bus/netdevsim/del_device done Fix this by enabling reload only after setup of device is complete and disabling it at the beginning of the cleanup process. Reported-by: Ido Schimmel Fixes: 2d8dc5bbf4e7 ("devlink: Add support for reload") Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 3 +++ drivers/net/ethernet/mellanox/mlxsw/core.c | 6 ++++- drivers/net/netdevsim/dev.c | 3 +++ include/net/devlink.h | 7 +++-- net/core/devlink.c | 42 +++++++++++++++++++++++++++++- 5 files changed, 57 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 22c72fb7206a..77f056b0895e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -4015,6 +4015,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_params_unregister; devlink_params_publish(devlink); + devlink_reload_enable(devlink); pci_save_state(pdev); return 0; @@ -4126,6 +4127,8 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct devlink *devlink = priv_to_devlink(priv); int active_vfs = 0; + devlink_reload_disable(devlink); + if (mlx4_is_slave(dev)) persist->interface_state |= MLX4_INTERFACE_STATE_NOWAIT; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index e1a90f5bddd0..da436a6aad2f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1198,8 +1198,10 @@ __mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, if (err) goto err_thermal_init; - if (mlxsw_driver->params_register) + if (mlxsw_driver->params_register) { devlink_params_publish(devlink); + devlink_reload_enable(devlink); + } return 0; @@ -1263,6 +1265,8 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, { struct devlink *devlink = priv_to_devlink(mlxsw_core); + if (!reload) + devlink_reload_disable(devlink); if (devlink_is_reload_failed(devlink)) { if (!reload) /* Only the parts that were not de-initialized in the diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 3da96c7e8265..059711edfc61 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -820,6 +820,7 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev) goto err_bpf_dev_exit; devlink_params_publish(devlink); + devlink_reload_enable(devlink); return 0; err_bpf_dev_exit: @@ -865,6 +866,8 @@ void nsim_dev_remove(struct nsim_bus_dev *nsim_bus_dev) struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct devlink *devlink = priv_to_devlink(nsim_dev); + devlink_reload_disable(devlink); + nsim_dev_reload_destroy(nsim_dev); nsim_bpf_dev_exit(nsim_dev); diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d6b5846822c..7891611868e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -38,8 +38,9 @@ struct devlink { struct device *dev; possible_net_t _net; struct mutex lock; - bool reload_failed; - bool registered; + u8 reload_failed:1, + reload_enabled:1, + registered:1; char priv[0] __aligned(NETDEV_ALIGN); }; @@ -824,6 +825,8 @@ void devlink_net_set(struct devlink *devlink, struct net *net); struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); +void devlink_reload_enable(struct devlink *devlink); +void devlink_reload_disable(struct devlink *devlink); void devlink_free(struct devlink *devlink); int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index ff53f7d29dea..2e027c9436e0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2791,6 +2791,9 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, { int err; + if (!devlink->reload_enabled) + return -EOPNOTSUPP; + err = devlink->ops->reload_down(devlink, !!dest_net, extack); if (err) return err; @@ -6308,12 +6311,49 @@ EXPORT_SYMBOL_GPL(devlink_register); void devlink_unregister(struct devlink *devlink) { mutex_lock(&devlink_mutex); + WARN_ON(devlink_reload_supported(devlink) && + devlink->reload_enabled); devlink_notify(devlink, DEVLINK_CMD_DEL); list_del(&devlink->list); mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_unregister); +/** + * devlink_reload_enable - Enable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at end of device initialization + * process when reload operation is supported. + */ +void devlink_reload_enable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->reload_enabled = true; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_enable); + +/** + * devlink_reload_disable - Disable reload of devlink instance + * + * @devlink: devlink + * + * Should be called at the beginning of device cleanup + * process when reload operation is supported. + */ +void devlink_reload_disable(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + /* Mutex is taken which ensures that no reload operation is in + * progress while setting up forbidded flag. + */ + devlink->reload_enabled = false; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_reload_disable); + /** * devlink_free - Free devlink instance resources * @@ -8201,7 +8241,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) if (WARN_ON(!devlink_reload_supported(devlink))) continue; err = devlink_reload(devlink, &init_net, NULL); - if (err) + if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } -- cgit v1.2.3 From 6c7295e13ffd5623b02f1adc1442f1d8a3d52424 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Fri, 8 Nov 2019 23:45:20 +0000 Subject: devlink: Add new "enable_roce" generic device param New device parameter to enable/disable handling of RoCE traffic in the device. Signed-off-by: Michael Guralnik Acked-by: Jiri Pirko Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- Documentation/networking/devlink-params.txt | 4 ++++ include/net/devlink.h | 4 ++++ net/core/devlink.c | 5 +++++ 3 files changed, 13 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink-params.txt b/Documentation/networking/devlink-params.txt index ddba3e9b55b1..04e234e9acc9 100644 --- a/Documentation/networking/devlink-params.txt +++ b/Documentation/networking/devlink-params.txt @@ -65,3 +65,7 @@ reset_dev_on_drv_probe [DEVICE, GENERIC] Reset only if device firmware can be found in the filesystem. Type: u8 + +enable_roce [DEVICE, GENERIC] + Enable handling of RoCE traffic in the device. + Type: Boolean diff --git a/include/net/devlink.h b/include/net/devlink.h index 23e4b65ec9df..39fb4d957838 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -400,6 +400,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -434,6 +435,9 @@ enum devlink_param_generic_id { "reset_dev_on_drv_probe" #define DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE DEVLINK_PARAM_TYPE_U8 +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME "enable_roce" +#define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/core/devlink.c b/net/core/devlink.c index f80151eeaf51..0fbcd44aa64f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2884,6 +2884,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From db53c73a8b5db120cb741d7d932cdf831a576e8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:29 -0800 Subject: netprio: use css ID instead of cgroup ID netprio uses cgroup ID to index the priority mapping table. This is currently okay as cgroup IDs are allocated using idr and packed. However, cgroup IDs will be changed to use full 64bit range and won't be packed making this impractical. netprio doesn't care what type of IDs it uses as long as they can identify the controller instances and are packed. Let's switch to css IDs instead of cgroup IDs. Signed-off-by: Tejun Heo Acked-by: Neil Horman Reviewed-by: Greg Kroah-Hartman Cc: "David S. Miller" Cc: Namhyung Kim --- include/net/netprio_cgroup.h | 2 +- net/core/netprio_cgroup.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index cfc9441ef074..dec7522b6ce1 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_lock(); css = task_css(p, net_prio_cgrp_id); - idx = css->cgroup->id; + idx = css->id; rcu_read_unlock(); return idx; } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 256b7954b720..8881dd943dd0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); - int id = css->cgroup->id; + int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; @@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; - int id = css->cgroup->id; + int id = css->id; int ret; /* avoid extending priomap for zero writes */ @@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return css->cgroup->id; + return css->id; } static int read_priomap(struct seq_file *sf, void *v) @@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { - void *v = (void *)(unsigned long)css->cgroup->id; + void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); -- cgit v1.2.3 From 67c0496e87d193b8356d2af49ab95e8a1b954b3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: kernfs: convert kernfs_node->id from union kernfs_node_id to u64 kernfs_node->id is currently a union kernfs_node_id which represents either a 32bit (ino, gen) pair or u64 value. I can't see much value in the usage of the union - all that's needed is a 64bit ID which the current code is already limited to. Using a union makes the code unnecessarily complicated and prevents using 64bit ino without adding practical benefits. This patch drops union kernfs_node_id and makes kernfs_node->id a u64. ino is stored in the lower 32bits and gen upper. Accessors - kernfs[_id]_ino() and kernfs[_id]_gen() - are added to retrieve the ino and gen. This simplifies ID handling less cumbersome and will allow using 64bit inos on supported archs. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim Cc: Jens Axboe Cc: Alexei Starovoitov --- fs/kernfs/dir.c | 10 +++--- fs/kernfs/file.c | 4 +-- fs/kernfs/inode.c | 4 +-- fs/kernfs/mount.c | 7 ++--- include/linux/cgroup.h | 17 +++++----- include/linux/kernfs.h | 45 ++++++++++++++++----------- include/trace/events/writeback.h | 4 +-- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 3 +- kernel/trace/blktrace.c | 67 +++++++++++++++++++--------------------- net/core/filter.c | 4 +-- 12 files changed, 85 insertions(+), 84 deletions(-) (limited to 'net/core') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index beabb585a7d8..c67afb591e5b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -532,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -639,8 +639,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->id.ino = ret; - kn->id.generation = gen; + + kn->id = (u64)gen << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -671,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -1656,7 +1656,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->id.ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e8c792b49616..34366db3620d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -892,7 +892,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->id.ino); + inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; @@ -901,7 +901,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->id.ino); + p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, &name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3eaa8869f42..eac277c63d42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->id.generation; + inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->id.ino); + inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 067b7c380056..f05d5d6f926d 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,15 +57,14 @@ const struct super_operations kernfs_sops = { * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode * number and generation */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id) +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; - kn = kernfs_find_and_get_node_by_ino(root, id->ino); + kn = kernfs_find_and_get_node_by_ino(root, kernfs_id_ino(id)); if (!kn) return NULL; - if (kn->id.generation != id->generation) { + if (kernfs_gen(kn) != kernfs_id_gen(id)) { kernfs_put(kn); return NULL; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..815fff49d555 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -616,7 +616,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->id.ino; + return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ @@ -687,13 +687,12 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return &cgrp->kn->id; + return cgrp->kn->id; } -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen); +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -718,9 +717,9 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) { - return NULL; + return 0; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) @@ -739,8 +738,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return true; } -static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) {} +static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) +{} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index f797ccc650e7..b2fc5c8ef6d9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -104,21 +104,6 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; -/* represent a kernfs node */ -union kernfs_node_id { - struct { - /* - * blktrace will export this struct as a simplified 'struct - * fid' (which is a big data struction), so userspace can use - * it to find kernfs node. The layout must match the first two - * fields of 'struct fid' exactly. - */ - u32 ino; - u32 generation; - }; - u64 id; -}; - /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -155,7 +140,12 @@ struct kernfs_node { void *priv; - union kernfs_node_id id; + /* + * 64bit unique ID. Lower 32bits carry the inode number and lower + * generation. + */ + u64 id; + unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; @@ -292,6 +282,26 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) return kn->flags & KERNFS_TYPE_MASK; } +static inline ino_t kernfs_id_ino(u64 id) +{ + return (u32)id; +} + +static inline u32 kernfs_id_gen(u64 id) +{ + return id >> 32; +} + +static inline ino_t kernfs_ino(struct kernfs_node *kn) +{ + return kernfs_id_ino(kn->id); +} + +static inline ino_t kernfs_gen(struct kernfs_node *kn) +{ + return kernfs_id_gen(kn->id); +} + /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty @@ -383,8 +393,7 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 95e50677476b..b4f0ffe1817e 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -152,7 +152,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->id.ino; + return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) @@ -260,7 +260,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..912e761cd17a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgrp->kn->id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..5d867f6d7204 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup->kn->id; map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cf32c0c7a45d..c6bd1a5a1977 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5786,8 +5786,7 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..a986d2e74ca2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; + return 0; return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - (has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1269,9 +1263,10 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, blkcg_name_buf, act, rwbs); } else trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %lx,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + kernfs_id_ino(id), kernfs_id_gen(id), + act, rwbs); } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); diff --git a/net/core/filter.c b/net/core/filter.c index ed6563622ce3..b360a7beb6fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id.id; + return cgrp->kn->id; } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id.id; + return ancestor->kn->id; } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { -- cgit v1.2.3 From 743210386c0354a2f8ef3d697353c7d8477fa81d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 Nov 2019 15:54:30 -0800 Subject: cgroup: use cgrp->kn->id as the cgroup ID cgroup ID is currently allocated using a dedicated per-hierarchy idr and used internally and exposed through tracepoints and bpf. This is confusing because there are tracepoints and other interfaces which use the cgroupfs ino as IDs. The preceding changes made kn->id exposed as ino as 64bit ino on supported archs or ino+gen (low 32bits as ino, high gen). There's no reason for cgroup to use different IDs. The kernfs IDs are unique and userland can easily discover them and map them back to paths using standard file operations. This patch replaces cgroup IDs with kernfs IDs. * cgroup_id() is added and all cgroup ID users are converted to use it. * kernfs_node creation is moved to earlier during cgroup init so that cgroup_id() is available during init. * While at it, s/cgroup/cgrp/ in psi helpers for consistency. * Fallback ID value is changed to 1 to be consistent with root cgroup ID. Signed-off-by: Tejun Heo Reviewed-by: Greg Kroah-Hartman Cc: Namhyung Kim --- include/linux/cgroup-defs.h | 17 ++-------- include/linux/cgroup.h | 17 ++++------ include/trace/events/cgroup.h | 6 ++-- kernel/bpf/helpers.c | 2 +- kernel/bpf/local_storage.c | 2 +- kernel/cgroup/cgroup.c | 76 ++++++++++++++----------------------------- kernel/trace/blktrace.c | 4 +-- net/core/filter.c | 4 +-- 8 files changed, 43 insertions(+), 85 deletions(-) (limited to 'net/core') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4904b1ebd1ff..63097cb243cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -354,16 +354,6 @@ struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with @@ -488,7 +478,7 @@ struct cgroup { struct cgroup_freezer_state freezer; /* ids of the ancestors at each level including self */ - int ancestor_ids[]; + u64 ancestor_ids[]; }; /* @@ -509,7 +499,7 @@ struct cgroup_root { struct cgroup cgrp; /* for cgrp->ancestor_ids[0] */ - int cgrp_ancestor_id_storage; + u64 cgrp_ancestor_id_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -520,9 +510,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 815fff49d555..d7ddebd0cdec 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -304,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + /** * css_get - obtain a reference on the specified css * @css: target css @@ -565,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == ancestor->id; + return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** @@ -687,17 +692,13 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return cgrp->kn->id; -} - void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; +static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -717,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union u64 cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return 0; -} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a566cc521476..7f42a3de59e6 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; - __entry->dst_id = dst_cgrp->id; + __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; @@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 912e761cd17a..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 5d867f6d7204..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b5dcbee5aa6c..c12dcf7dc432 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1308,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1917,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1938,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -1976,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_id(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -3552,22 +3544,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4987,9 +4979,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5154,10 +5143,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5177,15 +5168,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5195,7 +5184,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5219,7 +5208,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5248,12 +5237,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); - /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. @@ -5267,8 +5250,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5305,7 +5288,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ @@ -5321,27 +5303,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5356,7 +5330,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7dac5b63f3f..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -171,7 +171,7 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : 0); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif @@ -759,7 +759,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) if (!bio->bi_blkg) return 0; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) diff --git a/net/core/filter.c b/net/core/filter.c index b360a7beb6fc..caef7c74cad5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id; + return cgroup_id(cgrp); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id; + return cgroup_id(ancestor); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { -- cgit v1.2.3 From e2cde864a1d3e3626bfc8fa088fbc82b04ce66ed Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 12 Nov 2019 14:07:49 +0200 Subject: devlink: Allow large formatted message of binary output Devlink supports pair output of name and value. When the value is binary, it must be presented in an array. If the length of the binary value exceeds fmsg limitation, break the value into chunks internally. Signed-off-by: Aya Levin Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 4 +--- net/core/devlink.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7891611868e4..7e72b2e71164 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -967,8 +967,6 @@ int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value); int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value); int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value); @@ -981,7 +979,7 @@ int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, const char *value); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len); + const void *value, u32 value_len); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, diff --git a/net/core/devlink.c b/net/core/devlink.c index 2e027c9436e0..9bad78388a07 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4414,12 +4414,11 @@ int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); -int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) +static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) { return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); } -EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, bool value) @@ -4527,19 +4526,26 @@ int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) + const void *value, u32 value_len) { + u32 data_size; + u32 offset; int err; - err = devlink_fmsg_pair_nest_start(fmsg, name); + err = devlink_fmsg_arr_pair_nest_start(fmsg, name); if (err) return err; - err = devlink_fmsg_binary_put(fmsg, value, value_len); - if (err) - return err; + for (offset = 0; offset < value_len; offset += data_size) { + data_size = value_len - offset; + if (data_size > DEVLINK_FMSG_MAX_SIZE) + data_size = DEVLINK_FMSG_MAX_SIZE; + err = devlink_fmsg_binary_put(fmsg, value + offset, data_size); + if (err) + return err; + } - err = devlink_fmsg_pair_nest_end(fmsg); + err = devlink_fmsg_arr_pair_nest_end(fmsg); if (err) return err; -- cgit v1.2.3 From 9cc31b3a092d9bf2a18f09ad77e727ddb42a5b1e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:14 -0800 Subject: bpf: Fix race in btf_resolve_helper_id() btf_resolve_helper_id() caching logic is a bit racy, since under root the verifier can verify several programs in parallel. Fix it with READ/WRITE_ONCE. Fix the type as well, since error is also recorded. Fixes: a7658e1a4164 ("bpf: Check types of arguments passed into helpers") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20191114185720.1641606-15-ast@kernel.org --- include/linux/bpf.h | 5 +++-- kernel/bpf/btf.c | 26 +++++++++++++++++++++++++- kernel/bpf/verifier.c | 8 +++----- net/core/filter.c | 2 +- 4 files changed, 32 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0d4c5c224d79..cb5a356381f5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -248,7 +248,7 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - u32 *btf_id; /* BTF ids of arguments */ + int *btf_id; /* BTF ids of arguments */ }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -881,7 +881,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int); +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9e1164e5b429..033d071eb59c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3721,7 +3721,8 @@ again: return -EINVAL; } -u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) +static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, + int arg) { char fnname[KSYM_SYMBOL_LEN + 4] = "btf_"; const struct btf_param *args; @@ -3789,6 +3790,29 @@ u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn, int arg) return btf_id; } +int btf_resolve_helper_id(struct bpf_verifier_log *log, + const struct bpf_func_proto *fn, int arg) +{ + int *btf_id = &fn->btf_id[arg]; + int ret; + + if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID) + return -EINVAL; + + ret = READ_ONCE(*btf_id); + if (ret) + return ret; + /* ok to race the search. The result is the same */ + ret = __btf_resolve_helper_id(log, fn->func, arg); + if (!ret) { + /* Function argument cannot be type 'void' */ + bpf_log(log, "BTF resolution bug\n"); + return -EFAULT; + } + WRITE_ONCE(*btf_id, ret); + return ret; +} + static int __get_type_size(struct btf *btf, u32 btf_id, const struct btf_type **bad_type) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f89cfa93e88..e78ec7990767 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4147,11 +4147,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID) { - if (!fn->btf_id[i]) - fn->btf_id[i] = btf_resolve_helper_id(&env->log, fn->func, i); - meta.btf_id = fn->btf_id[i]; - } + err = btf_resolve_helper_id(&env->log, fn, i); + if (err > 0) + meta.btf_id = err; err = check_func_arg(env, BPF_REG_1 + i, fn->arg_type[i], &meta); if (err) return err; diff --git a/net/core/filter.c b/net/core/filter.c index fc303abec8fa..f72face90659 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3816,7 +3816,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; -static u32 bpf_skb_output_btf_ids[5]; +static int bpf_skb_output_btf_ids[5]; const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, -- cgit v1.2.3 From 91cc1a99740e2ed1d903b5906afb470cc5a07379 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:15 -0800 Subject: bpf: Annotate context types Annotate BPF program context types with program-side type and kernel-side type. This type information is used by the verifier. btf_get_prog_ctx_type() is used in the later patches to verify that BTF type of ctx in BPF program matches to kernel expected ctx type. For example, the XDP program type is: BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, struct xdp_md, struct xdp_buff) That means that XDP program should be written as: int xdp_prog(struct xdp_md *ctx) { ... } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-16-ast@kernel.org --- include/linux/bpf.h | 11 ++++- include/linux/bpf_types.h | 78 ++++++++++++++++++++----------- kernel/bpf/btf.c | 114 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 4 +- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 10 ---- 6 files changed, 176 insertions(+), 43 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cb5a356381f5..9c48f11fe56e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -747,7 +747,7 @@ DECLARE_PER_CPU(int, bpf_prog_active); extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ extern const struct bpf_verifier_ops _name ## _verifier_ops; #define BPF_MAP_TYPE(_id, _ops) \ @@ -1213,6 +1213,15 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif #ifdef CONFIG_INET +struct sk_reuseport_kern { + struct sk_buff *skb; + struct sock *sk; + struct sock *selected_sk; + void *data_end; + u32 hash; + u32 reuseport_id; + bool bind_inany; +}; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index de14872b01ba..93740b3614d7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -2,42 +2,68 @@ /* internal file - do not include directly */ #ifdef CONFIG_NET -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) -BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp, + struct xdp_md, struct xdp_buff) #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock, + struct bpf_sock, struct sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr, + struct bpf_sock_addr, struct bpf_sock_addr_kern) #endif -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) -BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) -BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops, + struct bpf_sock_ops, struct bpf_sock_ops_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb, + struct __sk_buff, struct sk_buff) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg, + struct sk_msg_md, struct sk_msg) +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector, + struct __sk_buff, struct bpf_flow_dissector) #endif #ifdef CONFIG_BPF_EVENTS -BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) -BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) -BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing) +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe, + bpf_user_pt_regs_t, struct pt_regs) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint, + __u64, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event, + struct bpf_perf_event_data, struct bpf_perf_event_data_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable, + struct bpf_raw_tracepoint_args, u64) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACING, tracing, + void *, void *) #endif #ifdef CONFIG_CGROUP_BPF -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) -BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev, + struct bpf_cgroup_dev_ctx, struct bpf_cgroup_dev_ctx) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl, + struct bpf_sysctl, struct bpf_sysctl_kern) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt, + struct bpf_sockopt, struct bpf_sockopt_kern) #endif #ifdef CONFIG_BPF_LIRC_MODE2 -BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2, + __u32, u32) #endif #ifdef CONFIG_INET -BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport, + struct sk_reuseport_md, struct sk_reuseport_kern) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 033d071eb59c..4b7c8bd423d6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2,6 +2,8 @@ /* Copyright (c) 2018 Facebook */ #include +#include +#include #include #include #include @@ -16,6 +18,9 @@ #include #include #include +#include +#include +#include /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -3439,13 +3444,98 @@ errout: extern char __weak _binary__btf_vmlinux_bin_start[]; extern char __weak _binary__btf_vmlinux_bin_end[]; +extern struct btf *btf_vmlinux; + +#define BPF_MAP_TYPE(_id, _ops) +static union { + struct bpf_ctx_convert { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + prog_ctx_type _id##_prog; \ + kern_ctx_type _id##_kern; +#include +#undef BPF_PROG_TYPE + } *__t; + /* 't' is written once under lock. Read many times. */ + const struct btf_type *t; +} bpf_ctx_convert; +enum { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +static u8 bpf_ctx_convert_map[] = { +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ + [_id] = __ctx_convert##_id, +#include +#undef BPF_PROG_TYPE +}; +#undef BPF_MAP_TYPE + +static const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type) +{ + const struct btf_type *conv_struct; + const struct btf_type *ctx_struct; + const struct btf_member *ctx_type; + const char *tname, *ctx_tname; + + conv_struct = bpf_ctx_convert.t; + if (!conv_struct) { + bpf_log(log, "btf_vmlinux is malformed\n"); + return NULL; + } + t = btf_type_by_id(btf, t->type); + while (btf_type_is_modifier(t)) + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_struct(t)) { + /* Only pointer to struct is supported for now. + * That means that BPF_PROG_TYPE_TRACEPOINT with BTF + * is not supported yet. + * BPF_PROG_TYPE_RAW_TRACEPOINT is fine. + */ + bpf_log(log, "BPF program ctx type is not a struct\n"); + return NULL; + } + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) { + bpf_log(log, "BPF program ctx struct doesn't have a name\n"); + return NULL; + } + /* prog_type is valid bpf program type. No need for bounds check. */ + ctx_type = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2; + /* ctx_struct is a pointer to prog_ctx_type in vmlinux. + * Like 'struct __sk_buff' + */ + ctx_struct = btf_type_by_id(btf_vmlinux, ctx_type->type); + if (!ctx_struct) + /* should not happen */ + return NULL; + ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); + if (!ctx_tname) { + /* should not happen */ + bpf_log(log, "Please fix kernel include/linux/bpf_types.h\n"); + return NULL; + } + /* only compare that prog's ctx type name is the same as + * kernel expects. No need to compare field by field. + * It's ok for bpf prog to do: + * struct __sk_buff {}; + * int socket_filter_bpf_prog(struct __sk_buff *skb) + * { // no fields of skb are ever used } + */ + if (strcmp(ctx_tname, tname)) + return NULL; + return ctx_type; +} struct btf *btf_parse_vmlinux(void) { struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; - int err; + int err, i; env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) @@ -3479,6 +3569,26 @@ struct btf *btf_parse_vmlinux(void) if (err) goto errout; + /* find struct bpf_ctx_convert for type checking later */ + for (i = 1; i <= btf->nr_types; i++) { + const struct btf_type *t; + const char *tname; + + t = btf_type_by_id(btf, i); + if (!__btf_type_is_struct(t)) + continue; + tname = __btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, "bpf_ctx_convert")) { + /* btf_parse_vmlinux() runs under bpf_verifier_lock */ + bpf_ctx_convert.t = t; + break; + } + } + if (i > btf->nr_types) { + err = -ENOENT; + goto errout; + } + btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; @@ -3492,8 +3602,6 @@ errout: return ERR_PTR(err); } -extern struct btf *btf_vmlinux; - bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e2e37bea86bc..05a0ee75eca0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; static const struct bpf_map_ops * const bpf_map_types[] = { -#define BPF_PROG_TYPE(_id, _ops) +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) \ [_id] = &_ops, #include @@ -1189,7 +1189,7 @@ err_put: } static const struct bpf_prog_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e78ec7990767..7395d6bebefd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,7 +23,7 @@ #include "disasm.h" static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ +#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include diff --git a/net/core/filter.c b/net/core/filter.c index f72face90659..49ded4a7588a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8684,16 +8684,6 @@ out: } #ifdef CONFIG_INET -struct sk_reuseport_kern { - struct sk_buff *skb; - struct sock *sk; - struct sock *selected_sk; - void *data_end; - u32 hash; - u32 reuseport_id; - bool bind_inany; -}; - static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From c3f812cea0d7006469d1cf33a4a9f0a12bb4b3a3 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 14 Nov 2019 14:13:00 -0800 Subject: page_pool: do not release pool until inflight == 0. The page pool keeps track of the number of pages in flight, and it isn't safe to remove the pool until all pages are returned. Disallow removing the pool until all pages are back, so the pool is always available for page producers. Make the page pool responsible for its own delayed destruction instead of relying on XDP, so the page pool can be used without the xdp memory model. When all pages are returned, free the pool and notify xdp if the pool is registered with the xdp memory system. Have the callback perform a table walk since some drivers (cpsw) may share the pool among multiple xdp_rxq_info. Note that the increment of pages_state_release_cnt may result in inflight == 0, resulting in the pool being released. Fixes: d956a048cd3f ("xdp: force mem allocator removal and periodic warning") Signed-off-by: Jonathan Lemon Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +- include/net/page_pool.h | 52 +++------ include/net/xdp_priv.h | 4 - include/trace/events/xdp.h | 19 +--- net/core/page_pool.c | 122 ++++++++++++++-------- net/core/xdp.c | 121 ++++++++------------- 6 files changed, 139 insertions(+), 183 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 4ba250a9008f..8cc4cd0cc515 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1503,10 +1503,8 @@ static void free_dma_rx_desc_resources(struct stmmac_priv *priv) rx_q->dma_erx, rx_q->dma_rx_phy); kfree(rx_q->buf_pool); - if (rx_q->page_pool) { - page_pool_request_shutdown(rx_q->page_pool); + if (rx_q->page_pool) page_pool_destroy(rx_q->page_pool); - } } } diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 2cbcdbdec254..1121faa99c12 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -70,7 +70,12 @@ struct page_pool_params { struct page_pool { struct page_pool_params p; - u32 pages_state_hold_cnt; + struct delayed_work release_dw; + void (*disconnect)(void *); + unsigned long defer_start; + unsigned long defer_warn; + + u32 pages_state_hold_cnt; /* * Data structure for allocation side @@ -129,25 +134,19 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) struct page_pool *page_pool_create(const struct page_pool_params *params); -void __page_pool_free(struct page_pool *pool); -static inline void page_pool_free(struct page_pool *pool) -{ - /* When page_pool isn't compiled-in, net/core/xdp.c doesn't - * allow registering MEM_TYPE_PAGE_POOL, but shield linker. - */ #ifdef CONFIG_PAGE_POOL - __page_pool_free(pool); -#endif -} - -/* Drivers use this instead of page_pool_free */ +void page_pool_destroy(struct page_pool *pool); +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); +#else static inline void page_pool_destroy(struct page_pool *pool) { - if (!pool) - return; +} - page_pool_free(pool); +static inline void page_pool_use_xdp_mem(struct page_pool *pool, + void (*disconnect)(void *)) +{ } +#endif /* Never call this directly, use helpers below */ void __page_pool_put_page(struct page_pool *pool, @@ -170,24 +169,6 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, __page_pool_put_page(pool, page, true); } -/* API user MUST have disconnected alloc-side (not allowed to call - * page_pool_alloc_pages()) before calling this. The free-side can - * still run concurrently, to handle in-flight packet-pages. - * - * A request to shutdown can fail (with false) if there are still - * in-flight packet-pages. - */ -bool __page_pool_request_shutdown(struct page_pool *pool); -static inline bool page_pool_request_shutdown(struct page_pool *pool) -{ - bool safe_to_remove = false; - -#ifdef CONFIG_PAGE_POOL - safe_to_remove = __page_pool_request_shutdown(pool); -#endif - return safe_to_remove; -} - /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal @@ -216,11 +197,6 @@ static inline bool is_page_pool_compiled_in(void) #endif } -static inline void page_pool_get(struct page_pool *pool) -{ - refcount_inc(&pool->user_cnt); -} - static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); diff --git a/include/net/xdp_priv.h b/include/net/xdp_priv.h index 6a8cba6ea79a..a9d5b7603b89 100644 --- a/include/net/xdp_priv.h +++ b/include/net/xdp_priv.h @@ -12,12 +12,8 @@ struct xdp_mem_allocator { struct page_pool *page_pool; struct zero_copy_allocator *zc_alloc; }; - int disconnect_cnt; - unsigned long defer_start; struct rhash_head node; struct rcu_head rcu; - struct delayed_work defer_wq; - unsigned long defer_warn; }; #endif /* __LINUX_NET_XDP_PRIV_H__ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index c7e3c9c5bad3..a7378bcd9928 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -317,19 +317,15 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, - TP_PROTO(const struct xdp_mem_allocator *xa, - bool safe_to_remove, bool force), + TP_PROTO(const struct xdp_mem_allocator *xa), - TP_ARGS(xa, safe_to_remove, force), + TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) - __field(bool, safe_to_remove) - __field(bool, force) - __field(int, disconnect_cnt) ), TP_fast_assign( @@ -337,19 +333,12 @@ TRACE_EVENT(mem_disconnect, __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; - __entry->safe_to_remove = safe_to_remove; - __entry->force = force; - __entry->disconnect_cnt = xa->disconnect_cnt; ), - TP_printk("mem_id=%d mem_type=%s allocator=%p" - " safe_to_remove=%s force=%s disconnect_cnt=%d", + TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), - __entry->allocator, - __entry->safe_to_remove ? "true" : "false", - __entry->force ? "true" : "false", - __entry->disconnect_cnt + __entry->allocator ) ); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5bc65587f1c4..dfc2501c35d9 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -18,6 +18,9 @@ #include +#define DEFER_TIME (msecs_to_jiffies(1000)) +#define DEFER_WARN_INTERVAL (60 * HZ) + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -193,22 +196,14 @@ static s32 page_pool_inflight(struct page_pool *pool) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt); - return distance; -} + s32 inflight; -static bool __page_pool_safe_to_destroy(struct page_pool *pool) -{ - s32 inflight = page_pool_inflight(pool); + inflight = _distance(hold_cnt, release_cnt); - /* The distance should not be able to become negative */ + trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); - return (inflight == 0); + return inflight; } /* Cleanup page_pool state from page */ @@ -216,6 +211,7 @@ static void __page_pool_clean_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; + int count; if (!(pool->p.flags & PP_FLAG_DMA_MAP)) goto skip_dma_unmap; @@ -227,9 +223,11 @@ static void __page_pool_clean_page(struct page_pool *pool, DMA_ATTR_SKIP_CPU_SYNC); page->dma_addr = 0; skip_dma_unmap: - atomic_inc(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, - atomic_read(&pool->pages_state_release_cnt)); + /* This may be the last page returned, releasing the pool, so + * it is not safe to reference pool afterwards. + */ + count = atomic_inc_return(&pool->pages_state_release_cnt); + trace_page_pool_state_release(pool, page, count); } /* unmap the page and clean our state */ @@ -338,31 +336,10 @@ static void __page_pool_empty_ring(struct page_pool *pool) } } -static void __warn_in_flight(struct page_pool *pool) +static void page_pool_free(struct page_pool *pool) { - u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); - u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); - s32 distance; - - distance = _distance(hold_cnt, release_cnt); - - /* Drivers should fix this, but only problematic when DMA is used */ - WARN(1, "Still in-flight pages:%d hold:%u released:%u", - distance, hold_cnt, release_cnt); -} - -void __page_pool_free(struct page_pool *pool) -{ - /* Only last user actually free/release resources */ - if (!page_pool_put(pool)) - return; - - WARN(pool->alloc.count, "API usage violation"); - WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty"); - - /* Can happen due to forced shutdown */ - if (!__page_pool_safe_to_destroy(pool)) - __warn_in_flight(pool); + if (pool->disconnect) + pool->disconnect(pool); ptr_ring_cleanup(&pool->ring, NULL); @@ -371,12 +348,8 @@ void __page_pool_free(struct page_pool *pool) kfree(pool); } -EXPORT_SYMBOL(__page_pool_free); -/* Request to shutdown: release pages cached by page_pool, and check - * for in-flight pages - */ -bool __page_pool_request_shutdown(struct page_pool *pool) +static void page_pool_scrub(struct page_pool *pool) { struct page *page; @@ -393,7 +366,64 @@ bool __page_pool_request_shutdown(struct page_pool *pool) * be in-flight. */ __page_pool_empty_ring(pool); +} + +static int page_pool_release(struct page_pool *pool) +{ + int inflight; + + page_pool_scrub(pool); + inflight = page_pool_inflight(pool); + if (!inflight) + page_pool_free(pool); + + return inflight; +} + +static void page_pool_release_retry(struct work_struct *wq) +{ + struct delayed_work *dwq = to_delayed_work(wq); + struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); + int inflight; + + inflight = page_pool_release(pool); + if (!inflight) + return; + + /* Periodic warning */ + if (time_after_eq(jiffies, pool->defer_warn)) { + int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; + + pr_warn("%s() stalled pool shutdown %d inflight %d sec\n", + __func__, inflight, sec); + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; + } + + /* Still not ready to be disconnected, retry later */ + schedule_delayed_work(&pool->release_dw, DEFER_TIME); +} + +void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)) +{ + refcount_inc(&pool->user_cnt); + pool->disconnect = disconnect; +} + +void page_pool_destroy(struct page_pool *pool) +{ + if (!pool) + return; + + if (!page_pool_put(pool)) + return; + + if (!page_pool_release(pool)) + return; + + pool->defer_start = jiffies; + pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; - return __page_pool_safe_to_destroy(pool); + INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); + schedule_delayed_work(&pool->release_dw, DEFER_TIME); } -EXPORT_SYMBOL(__page_pool_request_shutdown); +EXPORT_SYMBOL(page_pool_destroy); diff --git a/net/core/xdp.c b/net/core/xdp.c index 20781ad5f9c3..8e405abaf05a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -70,10 +70,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); - /* Allocator have indicated safe to remove before this is called */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - page_pool_free(xa->page_pool); - /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); @@ -85,62 +81,57 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) kfree(xa); } -static bool __mem_id_disconnect(int id, bool force) +static void mem_xa_remove(struct xdp_mem_allocator *xa) { - struct xdp_mem_allocator *xa; - bool safe_to_remove = true; + trace_mem_disconnect(xa); mutex_lock(&mem_id_lock); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return true; - } - xa->disconnect_cnt++; - - /* Detects in-flight packet-pages for page_pool */ - if (xa->mem.type == MEM_TYPE_PAGE_POOL) - safe_to_remove = page_pool_request_shutdown(xa->page_pool); - - trace_mem_disconnect(xa, safe_to_remove, force); - - if ((safe_to_remove || force) && - !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); mutex_unlock(&mem_id_lock); - return (safe_to_remove|force); } -#define DEFER_TIME (msecs_to_jiffies(1000)) -#define DEFER_WARN_INTERVAL (30 * HZ) -#define DEFER_MAX_RETRIES 120 +static void mem_allocator_disconnect(void *allocator) +{ + struct xdp_mem_allocator *xa; + struct rhashtable_iter iter; + + rhashtable_walk_enter(mem_id_ht, &iter); + do { + rhashtable_walk_start(&iter); + + while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) { + if (xa->allocator == allocator) + mem_xa_remove(xa); + } + + rhashtable_walk_stop(&iter); -static void mem_id_disconnect_defer_retry(struct work_struct *wq) + } while (xa == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); +} + +static void mem_id_disconnect(int id) { - struct delayed_work *dwq = to_delayed_work(wq); - struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq); - bool force = false; + struct xdp_mem_allocator *xa; - if (xa->disconnect_cnt > DEFER_MAX_RETRIES) - force = true; + mutex_lock(&mem_id_lock); - if (__mem_id_disconnect(xa->mem.id, force)) + xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + WARN(1, "Request remove non-existing id(%d), driver bug?", id); return; + } - /* Periodic warning */ - if (time_after_eq(jiffies, xa->defer_warn)) { - int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ; + trace_mem_disconnect(xa); - pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n", - __func__, xa->mem.id, xa->disconnect_cnt, sec); - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - } + if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - /* Still not ready to be disconnected, retry later */ - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); + mutex_unlock(&mem_id_lock); } void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) @@ -153,38 +144,21 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) return; } - if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL && - xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) { - return; - } - if (id == 0) return; - if (__mem_id_disconnect(id, false)) - return; - - /* Could not disconnect, defer new disconnect attempt to later */ - mutex_lock(&mem_id_lock); + if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) + return mem_id_disconnect(id); - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - return; + if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { + rcu_read_lock(); + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + page_pool_destroy(xa->page_pool); + rcu_read_unlock(); } - xa->defer_start = jiffies; - xa->defer_warn = jiffies + DEFER_WARN_INTERVAL; - - INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry); - mutex_unlock(&mem_id_lock); - schedule_delayed_work(&xa->defer_wq, DEFER_TIME); } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model); -/* This unregister operation will also cleanup and destroy the - * allocator. The page_pool_free() operation is first called when it's - * safe to remove, possibly deferred to a workqueue. - */ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -371,7 +345,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, } if (type == MEM_TYPE_PAGE_POOL) - page_pool_get(xdp_alloc->page_pool); + page_pool_use_xdp_mem(allocator, mem_allocator_disconnect); mutex_unlock(&mem_id_lock); @@ -402,15 +376,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); - if (likely(xa)) { - napi_direct &= !xdp_return_frame_no_direct(); - page_pool_put_page(xa->page_pool, page, napi_direct); - } else { - /* Hopefully stack show who to blame for late return */ - WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id); - trace_mem_return_failed(mem, page); - put_page(page); - } + napi_direct &= !xdp_return_frame_no_direct(); + page_pool_put_page(xa->page_pool, page, napi_direct); rcu_read_unlock(); break; case MEM_TYPE_PAGE_SHARED: -- cgit v1.2.3 From 8aef998df3979faa19626acf889abecb733342db Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 15 Nov 2019 12:11:35 +0300 Subject: net: core: allow fast GRO for skbs with Ethernet header in head Commit 78d3fd0b7de8 ("gro: Only use skb_gro_header for completely non-linear packets") back in May'09 (v2.6.31-rc1) has changed the original condition '!skb_headlen(skb)' to 'skb->mac_header == skb->tail' in gro_reset_offset() saying: "Since the drivers that need this optimisation all provide completely non-linear packets" (note that this condition has become the current 'skb_mac_header(skb) == skb_tail_pointer(skb)' later with commmit ced14f6804a9 ("net: Correct comparisons and calculations using skb->tail and skb-transport_header") without any functional changes). For now, we have the following rough statistics for v5.4-rc7: 1) napi_gro_frags: 14 2) napi_gro_receive with skb->head containing (most of) payload: 83 3) napi_gro_receive with skb->head containing all the headers: 20 4) napi_gro_receive with skb->head containing only Ethernet header: 2 With the current condition, fast GRO with the usage of NAPI_GRO_CB(skb)->frag0 is available only in the [1] case. Packets pushed by [2] and [3] go through the 'slow' path, but it's not a problem for them as they already contain all the needed headers in skb->head, so pskb_may_pull() only moves skb->data. The layout of skbs in the fourth [4] case at the moment of dev_gro_receive() is identical to skbs that have come through [1], as napi_frags_skb() pulls Ethernet header to skb->head. The only difference is that the mentioned condition is always false for them, because skb_put() and friends irreversibly alter the tail pointer. They also go through the 'slow' path, but now every single pskb_may_pull() in every single .gro_receive() will call the *really* slow __pskb_pull_tail() to pull headers to head. This significantly decreases the overall performance for no visible reasons. The only two users of method [4] is: * drivers/staging/qlge * drivers/net/wireless/iwlwifi (all three variants: dvm, mvm, mvm-mq) Note that in case with wireless drivers we can't use [1] (napi_gro_frags()) at least for now and mac80211 stack always performs pushes and pulls anyways, so performance hit is inavoidable. At the moment of v2.6.31 the mentioned change was necessary (that's why I don't add the "Fixes:" tag), but it became obsolete since skb_gro_mac_header() has gone in commit a50e233c50db ("net-gro: restore frag0 optimization"), so we can simply revert the condition in gro_reset_offset() to allow skbs from [4] go through the 'fast' path just like in case [1]. This was tested on a 600 MHz MIPS CPU and a custom driver and this patch gave boosts up to 40 Mbps to method [4] in both directions comparing to net-next, which made overall performance relatively close to [1] (without it, [4] is the slowest). v2: - Add more references and explanations to commit message - Fix some typos ibid - No functional changes Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1c799d486623..da78a433c10c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5611,8 +5611,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && + if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, -- cgit v1.2.3 From 1e0bd5a091e5d9e0f1d5b0e6329b87bb1792f784 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:02 -0800 Subject: bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails 92117d8443bc ("bpf: fix refcnt overflow") turned refcounting of bpf_map into potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit (32k). Due to using 32-bit counter, it's possible in practice to overflow refcounter and make it wrap around to 0, causing erroneous map free, while there are still references to it, causing use-after-free problems. But having a failing refcounting operations are problematic in some cases. One example is mmap() interface. After establishing initial memory-mapping, user is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily splitting it into multiple non-contiguous regions. All this happening without any control from the users of mmap subsystem. Rather mmap subsystem sends notifications to original creator of memory mapping through open/close callbacks, which are optionally specified during initial memory mapping creation. These callbacks are used to maintain accurate refcount for bpf_map (see next patch in this series). The problem is that open() callback is not supposed to fail, because memory-mapped resource is set up and properly referenced. This is posing a problem for using memory-mapping with BPF maps. One solution to this is to maintain separate refcount for just memory-mappings and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively. There are similar use cases in current work on tcp-bpf, necessitating extra counter as well. This seems like a rather unfortunate and ugly solution that doesn't scale well to various new use cases. Another approach to solve this is to use non-failing refcount_t type, which uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX, stays there. This utlimately causes memory leak, but prevents use after free. But given refcounting is not the most performance-critical operation with BPF maps (it's not used from running BPF program code), we can also just switch to 64-bit counter that can't overflow in practice, potentially disadvantaging 32-bit platforms a tiny bit. This simplifies semantics and allows above described scenarios to not worry about failing refcount increment operation. In terms of struct bpf_map size, we are still good and use the same amount of space: BEFORE (3 cache lines, 8 bytes of padding at the end): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic_t refcnt __attribute__((__aligned__(64))); /* 128 4 */ atomic_t usercnt; /* 132 4 */ struct work_struct work; /* 136 32 */ char name[16]; /* 168 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 146, holes: 1, sum holes: 38 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); AFTER (same 3 cache lines, no extra padding now): struct bpf_map { const struct bpf_map_ops * ops __attribute__((__aligned__(64))); /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ int spin_lock_off; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ u32 btf_key_type_id; /* 56 4 */ u32 btf_value_type_id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct btf * btf; /* 64 8 */ struct bpf_map_memory memory; /* 72 16 */ bool unpriv_array; /* 88 1 */ bool frozen; /* 89 1 */ /* XXX 38 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ atomic64_t refcnt __attribute__((__aligned__(64))); /* 128 8 */ atomic64_t usercnt; /* 136 8 */ struct work_struct work; /* 144 32 */ char name[16]; /* 176 16 */ /* size: 192, cachelines: 3, members: 21 */ /* sum members: 154, holes: 1, sum holes: 38 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 38 */ } __attribute__((__aligned__(64))); This patch, while modifying all users of bpf_map_inc, also cleans up its interface to match bpf_map_put with separate operations for bpf_map_inc and bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref, respectively). Also, given there are no users of bpf_map_inc_not_zero specifying uref=true, remove uref flag and default to uref=false internally. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 +- include/linux/bpf.h | 10 ++--- kernel/bpf/inode.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/syscall.c | 51 ++++++++++-------------- kernel/bpf/verifier.c | 6 +-- kernel/bpf/xskmap.c | 6 +-- net/core/bpf_sk_storage.c | 2 +- 8 files changed, 34 insertions(+), 49 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 88fab6a82acf..06927ba5a3ae 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -46,9 +46,7 @@ nfp_map_ptr_record(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog, /* Grab a single ref to the map for our record. The prog destroy ndo * happens after free_used_maps(). */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) - return PTR_ERR(map); + bpf_map_inc(map); record = kmalloc(sizeof(*record), GFP_KERNEL); if (!record) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b81cde47314..34a34445c009 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,8 +103,8 @@ struct bpf_map { /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ - atomic_t refcnt ____cacheline_aligned; - atomic_t usercnt; + atomic64_t refcnt ____cacheline_aligned; + atomic64_t usercnt; struct work_struct work; char name[BPF_OBJ_NAME_LEN]; }; @@ -783,9 +783,9 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); -struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); -struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, - bool uref); +void bpf_map_inc(struct bpf_map *map); +void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index a70f7209cda3..2f17f24258dc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - raw = bpf_map_inc(raw, true); + bpf_map_inc_with_uref(raw); break; default: WARN_ON_ONCE(1); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index fab4fb134547..4cbe987be35b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -98,7 +98,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) - inner_map = bpf_map_inc(inner_map, false); + bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c88c815c2154..20030751b7a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -311,7 +311,7 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { - if (atomic_dec_and_test(&map->usercnt)) { + if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } @@ -322,7 +322,7 @@ static void bpf_map_put_uref(struct bpf_map *map) */ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { - if (atomic_dec_and_test(&map->refcnt)) { + if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); btf_put(map->btf); @@ -575,8 +575,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - atomic_set(&map->refcnt, 1); - atomic_set(&map->usercnt, 1); + atomic64_set(&map->refcnt, 1); + atomic64_set(&map->usercnt, 1); if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; @@ -653,21 +653,19 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -/* prog's and map's refcnt limit */ -#define BPF_MAX_REFCNT 32768 - -struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) +void bpf_map_inc(struct bpf_map *map) { - if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&map->refcnt); - return ERR_PTR(-EBUSY); - } - if (uref) - atomic_inc(&map->usercnt); - return map; + atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); +void bpf_map_inc_with_uref(struct bpf_map *map) +{ + atomic64_inc(&map->refcnt); + atomic64_inc(&map->usercnt); +} +EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); @@ -677,38 +675,30 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - map = bpf_map_inc(map, true); + bpf_map_inc_with_uref(map); fdput(f); return map; } /* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, - bool uref) +static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; - refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); - - if (refold >= BPF_MAX_REFCNT) { - __bpf_map_put(map, false); - return ERR_PTR(-EBUSY); - } - + refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); - if (uref) - atomic_inc(&map->usercnt); + atomic64_inc(&map->usercnt); return map; } -struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, uref); + map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; @@ -1455,6 +1445,9 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } +/* prog's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9dc95a18d44..9f59f7a19dd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8179,11 +8179,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_used_maps() */ - map = bpf_map_inc(map, false); - if (IS_ERR(map)) { - fdput(f); - return PTR_ERR(map); - } + bpf_map_inc(map); aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index da16c30868f3..90c4fce1c981 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -11,10 +11,8 @@ int xsk_map_inc(struct xsk_map *map) { - struct bpf_map *m = &map->map; - - m = bpf_map_inc(m, false); - return PTR_ERR_OR_ZERO(m); + bpf_map_inc(&map->map); + return 0; } void xsk_map_put(struct xsk_map *map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index da5639a5bd3b..458be6b3eda9 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -798,7 +798,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) * Try to grab map refcnt to make sure that it's still * alive and prevent concurrent removal. */ - map = bpf_map_inc_not_zero(&smap->map, false); + map = bpf_map_inc_not_zero(&smap->map); if (IS_ERR(map)) continue; -- cgit v1.2.3 From c491eae8f9c0720520ebdeb4d335671f84b84b71 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:38 +0100 Subject: xdp: remove memory poison on free for struct xdp_mem_allocator When looking at the details I realised that the memory poison in __xdp_mem_allocator_rcu_free doesn't make sense. This is because the SLUB allocator uses the first 16 bytes (on 64 bit), for its freelist, which overlap with members in struct xdp_mem_allocator, that were updated. Thus, SLUB already does the "poisoning" for us. I still believe that poisoning memory make sense in other cases. Kernel have gained different use-after-free detection mechanism, but enabling those is associated with a huge overhead. Experience is that debugging facilities can change the timing so much, that that a race condition will not be provoked when enabled. Thus, I'm still in favour of poisoning memory where it makes sense. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/xdp.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index 8e405abaf05a..e334fad0a6b8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -73,11 +73,6 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) /* Allow this ID to be reused */ ida_simple_remove(&mem_id_pool, xa->mem.id); - /* Poison memory */ - xa->mem.id = 0xFFFF; - xa->mem.type = 0xF0F0; - xa->allocator = (void *)0xDEAD9001; - kfree(xa); } -- cgit v1.2.3 From 7c9e69428da39ed761c9d903c4850368fa4ef7bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sat, 16 Nov 2019 12:22:43 +0100 Subject: page_pool: add destroy attempts counter and rename tracepoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Jonathan change the page_pool to become responsible to its own shutdown via deferred work queue, then the disconnect_cnt counter was removed from xdp memory model tracepoint. This patch change the page_pool_inflight tracepoint name to page_pool_release, because it reflects the new responsability better. And it reintroduces a counter that reflect the number of times page_pool_release have been tried. The counter is also used by the code, to only empty the alloc cache once. With a stuck work queue running every second and counter being 64-bit, it will overrun in approx 584 billion years. For comparison, Earth lifetime expectancy is 7.5 billion years, before the Sun will engulf, and destroy, the Earth. Signed-off-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/net/page_pool.h | 2 ++ include/trace/events/page_pool.h | 9 ++++++--- net/core/page_pool.c | 13 +++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1121faa99c12..ace881c15dcb 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -112,6 +112,8 @@ struct page_pool { * refcnt serves purpose is to simplify drivers error handling. */ refcount_t user_cnt; + + u64 destroy_cnt; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 47b5ee880aa9..ee7f1aca7839 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -10,7 +10,7 @@ #include -TRACE_EVENT(page_pool_inflight, +TRACE_EVENT(page_pool_release, TP_PROTO(const struct page_pool *pool, s32 inflight, u32 hold, u32 release), @@ -22,6 +22,7 @@ TRACE_EVENT(page_pool_inflight, __field(s32, inflight) __field(u32, hold) __field(u32, release) + __field(u64, cnt) ), TP_fast_assign( @@ -29,10 +30,12 @@ TRACE_EVENT(page_pool_inflight, __entry->inflight = inflight; __entry->hold = hold; __entry->release = release; + __entry->cnt = pool->destroy_cnt; ), - TP_printk("page_pool=%p inflight=%d hold=%u release=%u", - __entry->pool, __entry->inflight, __entry->hold, __entry->release) + TP_printk("page_pool=%p inflight=%d hold=%u release=%u cnt=%llu", + __entry->pool, __entry->inflight, __entry->hold, + __entry->release, __entry->cnt) ); TRACE_EVENT(page_pool_state_release, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dfc2501c35d9..e28db2ef8e12 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -200,7 +200,7 @@ static s32 page_pool_inflight(struct page_pool *pool) inflight = _distance(hold_cnt, release_cnt); - trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt); + trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); return inflight; @@ -349,10 +349,13 @@ static void page_pool_free(struct page_pool *pool) kfree(pool); } -static void page_pool_scrub(struct page_pool *pool) +static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { struct page *page; + if (pool->destroy_cnt) + return; + /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. @@ -361,6 +364,12 @@ static void page_pool_scrub(struct page_pool *pool) page = pool->alloc.cache[--pool->alloc.count]; __page_pool_return_page(pool, page); } +} + +static void page_pool_scrub(struct page_pool *pool) +{ + page_pool_empty_alloc_cache_once(pool); + pool->destroy_cnt++; /* No more consumers should exist, but producers could still * be in-flight. -- cgit v1.2.3 From bc836748707cf6b8b1a948b61149278f109107da Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:17 +0000 Subject: page_pool: Add API to update numa node Add page_pool_update_nid() to be called by page pool consumers when they detect numa node changes. It will update the page pool nid value to start allocating from the new effective numa node. This is to mitigate page pool allocating pages from a wrong numa node, where the pool was originally allocated, and holding on to pages that belong to a different numa node, which causes performance degradation. For pages that are already being consumed and could be returned to the pool by the consumer, in next patch we will add a check per page to avoid recycling them back to the pool and return them to the page allocator. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/page_pool.h | 7 +++++++ include/trace/events/page_pool.h | 22 ++++++++++++++++++++++ net/core/page_pool.c | 8 ++++++++ 3 files changed, 37 insertions(+) (limited to 'net/core') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ace881c15dcb..e2e1b7b1e8ba 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -204,4 +204,11 @@ static inline bool page_pool_put(struct page_pool *pool) return refcount_dec_and_test(&pool->user_cnt); } +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid); +static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) +{ + if (unlikely(pool->p.nid != new_nid)) + page_pool_update_nid(pool, new_nid); +} #endif /* _NET_PAGE_POOL_H */ diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index 2f2a10e8eb56..ad0aa7f31675 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -89,6 +89,28 @@ TRACE_EVENT(page_pool_state_hold, __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); +TRACE_EVENT(page_pool_update_nid, + + TP_PROTO(const struct page_pool *pool, int new_nid), + + TP_ARGS(pool, new_nid), + + TP_STRUCT__entry( + __field(const struct page_pool *, pool) + __field(int, pool_nid) + __field(int, new_nid) + ), + + TP_fast_assign( + __entry->pool = pool; + __entry->pool_nid = pool->p.nid; + __entry->new_nid = new_nid; + ), + + TP_printk("page_pool=%p pool_nid=%d new_nid=%d", + __entry->pool, __entry->pool_nid, __entry->new_nid) +); + #endif /* _TRACE_PAGE_POOL_H */ /* This part must be outside protection */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e28db2ef8e12..9b704ea3f4b2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -436,3 +436,11 @@ void page_pool_destroy(struct page_pool *pool) schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); + +/* Caller must provide appropriate safe context, e.g. NAPI. */ +void page_pool_update_nid(struct page_pool *pool, int new_nid) +{ + trace_page_pool_update_nid(pool, new_nid); + pool->p.nid = new_nid; +} +EXPORT_SYMBOL(page_pool_update_nid); -- cgit v1.2.3 From d5394610b1ba06373b3543b5177a4c91052f9f62 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 20 Nov 2019 00:15:19 +0000 Subject: page_pool: Don't recycle non-reusable pages A page is NOT reusable when at least one of the following is true: 1) allocated when system was under some pressure. (page_is_pfmemalloc) 2) belongs to a different NUMA node than pool->p.nid. To update pool->p.nid users should call page_pool_update_nid(). Holding on to such pages in the pool will hurt the consumer performance when the pool migrates to a different numa node. Performance testing: XDP drop/tx rate and TCP single/multi stream, on mlx5 driver while migrating rx ring irq from close to far numa: mlx5 internal page cache was locally disabled to get pure page pool results. CPU: Intel(R) Xeon(R) CPU E5-2603 v4 @ 1.70GHz NIC: Mellanox Technologies MT27700 Family [ConnectX-4] (100G) XDP Drop/TX single core: NUMA | XDP | Before | After --------------------------------------- Close | Drop | 11 Mpps | 10.9 Mpps Far | Drop | 4.4 Mpps | 5.8 Mpps Close | TX | 6.5 Mpps | 6.5 Mpps Far | TX | 3.5 Mpps | 4 Mpps Improvement is about 30% drop packet rate, 15% tx packet rate for numa far test. No degradation for numa close tests. TCP single/multi cpu/stream: NUMA | #cpu | Before | After -------------------------------------- Close | 1 | 18 Gbps | 18 Gbps Far | 1 | 15 Gbps | 18 Gbps Close | 12 | 80 Gbps | 80 Gbps Far | 12 | 68 Gbps | 80 Gbps In all test cases we see improvement for the far numa case, and no impact on the close numa case. The impact of adding a check per page is very negligible, and shows no performance degradation whatsoever, also functionality wise it seems more correct and more robust for page pool to verify when pages should be recycled, since page pool can't guarantee where pages are coming from. Signed-off-by: Saeed Mahameed Acked-by: Jonathan Lemon Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/page_pool.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b704ea3f4b2..6c7f78bd6421 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -281,6 +281,17 @@ static bool __page_pool_recycle_direct(struct page *page, return true; } +/* page is NOT reusable when: + * 1) allocated when system is under some pressure. (page_is_pfmemalloc) + * 2) belongs to a different NUMA node than pool->p.nid. + * + * To update pool->p.nid users must call page_pool_update_nid. + */ +static bool pool_page_reusable(struct page_pool *pool, struct page *page) +{ + return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; +} + void __page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) { @@ -290,7 +301,8 @@ void __page_pool_put_page(struct page_pool *pool, * * refcnt == 1 means page_pool owns page, and can recycle it. */ - if (likely(page_ref_count(page) == 1)) { + if (likely(page_ref_count(page) == 1 && + pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ if (allow_direct && in_serving_softirq()) -- cgit v1.2.3 From b8eb718348b8fb30b5a7d0a8fce26fb3f4ac741b Mon Sep 17 00:00:00 2001 From: Jouni Hogander Date: Wed, 20 Nov 2019 09:08:16 +0200 Subject: net-sysfs: Fix reference count leak in rx|netdev_queue_add_kobject kobject_init_and_add takes reference even when it fails. This has to be given up by the caller in error handling. Otherwise memory allocated by kobject_init_and_add is never freed. Originally found by Syzkaller: BUG: memory leak unreferenced object 0xffff8880679f8b08 (size 8): comm "netdev_register", pid 269, jiffies 4294693094 (age 12.132s) hex dump (first 8 bytes): 72 78 2d 30 00 36 20 d4 rx-0.6 . backtrace: [<000000008c93818e>] __kmalloc_track_caller+0x16e/0x290 [<000000001f2e4e49>] kvasprintf+0xb1/0x140 [<000000007f313394>] kvasprintf_const+0x56/0x160 [<00000000aeca11c8>] kobject_set_name_vargs+0x5b/0x140 [<0000000073a0367c>] kobject_init_and_add+0xd8/0x170 [<0000000088838e4b>] net_rx_queue_update_kobjects+0x152/0x560 [<000000006be5f104>] netdev_register_kobject+0x210/0x380 [<00000000e31dab9d>] register_netdevice+0xa1b/0xf00 [<00000000f68b2465>] __tun_chr_ioctl+0x20d5/0x3dd0 [<000000004c50599f>] tun_chr_ioctl+0x2f/0x40 [<00000000bbd4c317>] do_vfs_ioctl+0x1c7/0x1510 [<00000000d4c59e8f>] ksys_ioctl+0x99/0xb0 [<00000000946aea81>] __x64_sys_ioctl+0x78/0xb0 [<0000000038d946e5>] do_syscall_64+0x16f/0x580 [<00000000e0aa5d8f>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [<00000000285b3d1a>] 0xffffffffffffffff Cc: David Miller Cc: Lukas Bulwahn Signed-off-by: Jouni Hogander Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 865ba6ca16eb..4f404bf33e44 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -923,21 +923,23 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) - return error; + goto err; dev_hold(queue->dev); if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; } kobject_uevent(kobj, KOBJ_ADD); return error; + +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ @@ -1461,21 +1463,21 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) - return error; + goto err; dev_hold(queue->dev); #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto err; #endif kobject_uevent(kobj, KOBJ_ADD); - return 0; +err: + kobject_put(kobj); + return error; } #endif /* CONFIG_SYSFS */ -- cgit v1.2.3 From e68bc75691cc3de608c2c7505057c948d13ae587 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Nov 2019 16:54:18 +0200 Subject: net: page_pool: add the possibility to sync DMA memory for device Introduce the following parameters in order to add the possibility to sync DMA memory for device before putting allocated pages in the page_pool caches: - PP_FLAG_DMA_SYNC_DEV: if set in page_pool_params flags, all pages that the driver gets from page_pool will be DMA-synced-for-device according to the length provided by the device driver. Please note DMA-sync-for-CPU is still device driver responsibility - offset: DMA address offset where the DMA engine starts copying rx data - max_len: maximum DMA memory size page_pool is allowed to flush. This is currently used in __page_pool_alloc_pages_slow routine when pages are allocated from page allocator These parameters are supposed to be set by device drivers. This optimization reduces the length of the DMA-sync-for-device. The optimization is valid because pages are initially DMA-synced-for-device as defined via max_len. At RX time, the driver will perform a DMA-sync-for-CPU on the memory for the packet length. What is important is the memory occupied by packet payload, because this is the area CPU is allowed to read and modify. As we don't track cache-lines written into by the CPU, simply use the packet payload length as dma_sync_size at page_pool recycle time. This also take into account any tail-extend. Tested-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Signed-off-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 24 ++++++++++++++++++------ net/core/page_pool.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index e2e1b7b1e8ba..cfbed00ba7ee 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,8 +34,18 @@ #include #include -#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */ -#define PP_FLAG_ALL PP_FLAG_DMA_MAP +#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA + * map/unmap + */ +#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets + * from page_pool will be + * DMA-synced-for-device according to + * the length provided by the device + * driver. + * Please note DMA-sync-for-CPU is still + * device driver responsibility + */ +#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV) /* * Fast allocation side cache array/stack @@ -65,6 +75,8 @@ struct page_pool_params { int nid; /* Numa node id to allocate from pages from */ struct device *dev; /* device, for DMA pre-mapping purposes */ enum dma_data_direction dma_dir; /* DMA mapping direction */ + unsigned int max_len; /* max DMA sync memory size */ + unsigned int offset; /* DMA addr offset */ }; struct page_pool { @@ -151,8 +163,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, #endif /* Never call this directly, use helpers below */ -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct); +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct); static inline void page_pool_put_page(struct page_pool *pool, struct page *page, bool allow_direct) @@ -161,14 +173,14 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - __page_pool_put_page(pool, page, allow_direct); + __page_pool_put_page(pool, page, -1, allow_direct); #endif } /* Very limited use-cases allow recycle direct */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { - __page_pool_put_page(pool, page, true); + __page_pool_put_page(pool, page, -1, true); } /* Disconnects a page (from a page_pool). API users can have a need diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6c7f78bd6421..a6aefe989043 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -47,6 +47,21 @@ static int page_pool_init(struct page_pool *pool, (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + /* In order to request DMA-sync-for-device the page + * needs to be mapped + */ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + + if (!pool->p.max_len) + return -EINVAL; + + /* pool->p.offset has to be set according to the address + * offset used by the DMA engine to start copying rx data + */ + } + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; @@ -115,6 +130,16 @@ static struct page *__page_pool_get_cached(struct page_pool *pool) return page; } +static void page_pool_dma_sync_for_device(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size) +{ + dma_sync_size = min(dma_sync_size, pool->p.max_len); + dma_sync_single_range_for_device(pool->p.dev, page->dma_addr, + pool->p.offset, dma_sync_size, + pool->p.dma_dir); +} + /* slow path */ noinline static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, @@ -159,6 +184,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, } page->dma_addr = dma; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + skip_dma_map: /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -292,8 +320,8 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid; } -void __page_pool_put_page(struct page_pool *pool, - struct page *page, bool allow_direct) +void __page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -305,6 +333,10 @@ void __page_pool_put_page(struct page_pool *pool, pool_page_reusable(pool, page))) { /* Read barrier done in page_ref_count / READ_ONCE */ + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, + dma_sync_size); + if (allow_direct && in_serving_softirq()) if (__page_pool_recycle_direct(page, pool)) return; -- cgit v1.2.3 From 48a322b6f9965b2f1e4ce81af972f0e287b07ed0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Nov 2019 19:19:07 -0800 Subject: net-sysfs: fix netdev_queue_add_kobject() breakage kobject_put() should only be called in error path. Fixes: b8eb718348b8 ("net-sysfs: Fix reference count leak in rx|netdev_queue_add_kobject") Signed-off-by: Eric Dumazet Cc: Jouni Hogander Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4f404bf33e44..ae3bcb1540ec 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1474,6 +1474,7 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) #endif kobject_uevent(kobj, KOBJ_ADD); + return 0; err: kobject_put(kobj); -- cgit v1.2.3 From 8163999db445021f2651a8a47b5632483e8722ea Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Nov 2019 08:25:09 -0800 Subject: bpf: skmsg, fix potential psock NULL pointer dereference Report from Dan Carpenter, net/core/skmsg.c:792 sk_psock_write_space() error: we previously assumed 'psock' could be null (see line 790) net/core/skmsg.c 789 psock = sk_psock(sk); 790 if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) Check for NULL 791 schedule_work(&psock->work); 792 write_space = psock->saved_write_space; ^^^^^^^^^^^^^^^^^^^^^^^^ 793 rcu_read_unlock(); 794 write_space(sk); Ensure psock dereference on line 792 only occurs if psock is not null. Reported-by: Dan Carpenter Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/core/skmsg.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ad31e4e53d0a..a469d2124f3f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -793,15 +793,18 @@ static void sk_psock_strp_data_ready(struct sock *sk) static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; - void (*write_space)(struct sock *sk); + void (*write_space)(struct sock *sk) = NULL; rcu_read_lock(); psock = sk_psock(sk); - if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))) - schedule_work(&psock->work); - write_space = psock->saved_write_space; + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } rcu_read_unlock(); - write_space(sk); + if (write_space) + write_space(sk); } int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -- cgit v1.2.3 From ff08ddba3a55caadd0ae531975b06b407d008ae7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Nov 2019 15:34:38 +0300 Subject: net: rtnetlink: prevent underflows in do_setvfinfo() The "ivm->vf" variable is a u32, but the problem is that a number of drivers cast it to an int and then forget to check for negatives. An example of this is in the cxgb4 driver. drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 2890 static int cxgb4_mgmt_get_vf_config(struct net_device *dev, 2891 int vf, struct ifla_vf_info *ivi) ^^^^^^ 2892 { 2893 struct port_info *pi = netdev_priv(dev); 2894 struct adapter *adap = pi->adapter; 2895 struct vf_info *vfinfo; 2896 2897 if (vf >= adap->num_vfs) ^^^^^^^^^^^^^^^^^^^ 2898 return -EINVAL; 2899 vfinfo = &adap->vfinfo[vf]; ^^^^^^^^^^^^^^^^^^^^^^^^^^ There are 48 functions affected. drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c:8435 hclge_set_vf_vlan_filter() warn: can 'vfid' underflow 's32min-2147483646' drivers/net/ethernet/freescale/enetc/enetc_pf.c:377 enetc_pf_set_vf_mac() warn: can 'vf' underflow 's32min-2147483646' drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:2899 cxgb4_mgmt_get_vf_config() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:2960 cxgb4_mgmt_set_vf_rate() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:3019 cxgb4_mgmt_set_vf_rate() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:3038 cxgb4_mgmt_set_vf_vlan() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c:3086 cxgb4_mgmt_set_vf_link_state() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/chelsio/cxgb/cxgb2.c:791 get_eeprom() warn: can 'i' underflow 's32min-(-4),0,4-s32max' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:82 bnxt_set_vf_spoofchk() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:164 bnxt_set_vf_trust() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:186 bnxt_get_vf_config() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:228 bnxt_set_vf_mac() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:264 bnxt_set_vf_vlan() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:293 bnxt_set_vf_bw() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c:333 bnxt_set_vf_link_state() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c:2595 bnx2x_vf_op_prep() warn: can 'vfidx' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c:2595 bnx2x_vf_op_prep() warn: can 'vfidx' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c:2281 bnx2x_post_vf_bulletin() warn: can 'vf' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c:2285 bnx2x_post_vf_bulletin() warn: can 'vf' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c:2286 bnx2x_post_vf_bulletin() warn: can 'vf' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c:2292 bnx2x_post_vf_bulletin() warn: can 'vf' underflow 's32min-63' drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c:2297 bnx2x_post_vf_bulletin() warn: can 'vf' underflow 's32min-63' drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c:1832 qlcnic_sriov_set_vf_mac() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c:1864 qlcnic_sriov_set_vf_tx_rate() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c:1937 qlcnic_sriov_set_vf_vlan() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c:2005 qlcnic_sriov_get_vf_config() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_pf.c:2036 qlcnic_sriov_set_vf_spoofchk() warn: can 'vf' underflow 's32min-254' drivers/net/ethernet/emulex/benet/be_main.c:1914 be_get_vf_config() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/emulex/benet/be_main.c:1915 be_get_vf_config() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/emulex/benet/be_main.c:1922 be_set_vf_tvt() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/emulex/benet/be_main.c:1951 be_clear_vf_tvt() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/emulex/benet/be_main.c:2063 be_set_vf_tx_rate() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/emulex/benet/be_main.c:2091 be_set_vf_link_state() warn: can 'vf' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:2609 ice_set_vf_port_vlan() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:3050 ice_get_vf_cfg() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:3103 ice_set_vf_spoofchk() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:3181 ice_set_vf_mac() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:3237 ice_set_vf_trust() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c:3286 ice_set_vf_link_state() warn: can 'vf_id' underflow 's32min-65534' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:3919 i40e_validate_vf() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:3957 i40e_ndo_set_vf_mac() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4104 i40e_ndo_set_vf_port_vlan() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4263 i40e_ndo_set_vf_bw() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4309 i40e_ndo_get_vf_config() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4371 i40e_ndo_set_vf_link_state() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4441 i40e_ndo_set_vf_spoofchk() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4441 i40e_ndo_set_vf_spoofchk() warn: can 'vf_id' underflow 's32min-2147483646' drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c:4504 i40e_ndo_set_vf_trust() warn: can 'vf_id' underflow 's32min-2147483646' Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c81cd80114d9..e4ec575c1fba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2195,6 +2195,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); + if (ivm->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, @@ -2206,6 +2208,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); + if (ivv->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, @@ -2238,6 +2242,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (len == 0) return -EINVAL; + if (ivvl[0]->vf >= INT_MAX) + return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) @@ -2248,6 +2254,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); @@ -2266,6 +2274,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, @@ -2278,6 +2288,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); + if (ivs->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, @@ -2289,6 +2301,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); + if (ivl->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, @@ -2302,6 +2316,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); + if (ivrssq_en->vf >= INT_MAX) + return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); @@ -2312,6 +2328,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); + if (ivt->vf >= INT_MAX) + return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); @@ -2322,15 +2340,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; - return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); + if (ivt->vf >= INT_MAX) + return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; -- cgit v1.2.3 From fc5141cb6a60afd81cf53cf4f9bd986f1b846010 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 22 Nov 2019 20:38:01 +0800 Subject: net: gro: use vlan API instead of accessing directly Use vlan common api to access the vlan_tag info. Signed-off-by: Tonghao Zhang Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index da78a433c10c..c7fc902ccbdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5586,7 +5586,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) -- cgit v1.2.3