From 8b58a39846568dcd7d0c98b2fadc25018e59dedf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Jul 2015 23:32:12 +0200 Subject: ipv6: use flag instead of u16 for hop in inet6_skb_parm Hop was always either 0 or sizeof(struct ipv6hdr). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 82806c60aa42..1319a6bb6b82 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -94,7 +94,6 @@ static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) struct inet6_skb_parm { int iif; __be16 ra; - __u16 hop; __u16 dst0; __u16 srcrt; __u16 dst1; @@ -111,6 +110,7 @@ struct inet6_skb_parm { #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 +#define IP6SKB_HOPBYHOP 32 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) -- cgit v1.2.3 From 634ec36cc0ab9d8dda0f2c101fa28d2e2a61b9eb Mon Sep 17 00:00:00 2001 From: David Thomson Date: Fri, 10 Jul 2015 13:56:54 +1200 Subject: net: phy: Pass mdix ethtool setting through to phy driver Pass the mdix setting from ethtool down to the phy driver, to allow driver specific implementations of manually setting the polarity. Signed-off-by: David Thomson Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 2 ++ include/linux/phy.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index b2197b506acb..47693a9ebd3a 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -353,6 +353,8 @@ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd) phydev->duplex = cmd->duplex; + phydev->mdix = cmd->eth_tp_mdix_ctrl; + /* Restart the PHY */ phy_start_aneg(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index a26c3f84b8dd..e5fb1d415961 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -424,6 +424,8 @@ struct phy_device { struct net_device *attached_dev; + u8 mdix; + void (*adjust_link)(struct net_device *dev); }; #define to_phy_device(d) container_of(d, struct phy_device, dev) -- cgit v1.2.3 From 70aa996601335ca3069190ebcdae8870828086a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:13:20 -0500 Subject: netfilter: kill nf_hooks_active The function obscures what is going on in nf_hook_thresh and it's existence requires computing the hook list twice. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 00050dfd9f23..60e89348a91d 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -150,11 +150,6 @@ static inline bool nf_hook_list_active(struct list_head *nf_hook_list, } #endif -static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) -{ - return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook); -} - int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state); /** @@ -172,10 +167,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct sock *, struct sk_buff *), int thresh) { - if (nf_hooks_active(pf, hook)) { + struct list_head *nf_hook_list = &nf_hooks[pf][hook]; + + if (nf_hook_list_active(nf_hook_list, pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh, + nf_hook_state_init(&state, nf_hook_list, hook, thresh, pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } -- cgit v1.2.3 From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 10 Jul 2015 18:15:06 -0500 Subject: netfilter: Per network namespace netfilter hooks. - Add a new set of functions for registering and unregistering per network namespace hooks. - Modify the old global namespace hook functions to use the per network namespace hooks in their implementation, so their remains a single list that needs to be walked for any hook (this is important for keeping the hook priority working and for keeping the code walking the hooks simple). - Only allow registering the per netdevice hooks in the network namespace where the network device lives. - Dynamically allocate the structures in the per network namespace hook list in nf_register_net_hook, and unregister them in nf_unregister_net_hook. Dynamic allocate is required somewhere as the number of network namespaces are not fixed so we might as well allocate them in the registration function. The chain of registered hooks on any list is expected to be small so the cost of walking that list to find the entry we are unregistering should also be small. Performing the management of the dynamically allocated list entries in the registration and unregistration functions keeps the complexity from spreading. Signed-off-by: "Eric W. Biederman" --- include/linux/netfilter.h | 14 +++- include/net/netns/netfilter.h | 1 + net/netfilter/core.c | 182 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 173 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 60e89348a91d..9bbd110ec81b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NETFILTER static inline int NF_DROP_GETERR(int verdict) @@ -118,6 +120,13 @@ struct nf_sockopt_ops { }; /* Function to register/unregister hook points. */ +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); + int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); @@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); -extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; - #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; @@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct sock *, struct sk_buff *), int thresh) { - struct list_head *nf_hook_list = &nf_hooks[pf][hook]; + struct net *net = dev_net(indev ? indev : outdev); + struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; if (nf_hook_list_active(nf_hook_list, pf, hook)) { struct nf_hook_state state; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 532e4ba64f49..38aa4983e2a9 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -14,5 +14,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif + struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif diff --git a/net/netfilter/core.c b/net/netfilter/core.c index fa4d3c111d3f..56ead1a1711c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; -EXPORT_SYMBOL(nf_hooks); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg) +static struct list_head *find_nf_hook_list(struct net *net, + const struct nf_hook_ops *reg) { struct list_head *nf_hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->dev) + if (reg->dev && dev_net(reg->dev) == net) nf_hook_list = ®->dev->nf_hooks_ingress; #endif } return nf_hook_list; } -int nf_register_hook(struct nf_hook_ops *reg) +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; - struct nf_hook_ops *elem; + struct nf_hook_ops *elem, *new; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; - nf_hook_list = find_nf_hook_list(reg); + new->hook = reg->hook; + new->dev = reg->dev; + new->owner = reg->owner; + new->priv = reg->priv; + new->pf = reg->pf; + new->hooknum = reg->hooknum; + new->priority = reg->priority; + + nf_hook_list = find_nf_hook_list(net, reg); if (!nf_hook_list) return -ENOENT; @@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg) if (reg->priority < elem->priority) break; } - list_add_rcu(®->list, elem->list.prev); + list_add_rcu(&new->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg) #endif return 0; } -EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; + struct nf_hook_ops *elem; + + nf_hook_list = find_nf_hook_list(net, reg); + if (!nf_hook_list) + return; + mutex_lock(&nf_hook_mutex); - list_del_rcu(®->list); + list_for_each_entry(elem, nf_hook_list, list) { + if ((reg->hook == elem->hook) && + (reg->dev == elem->dev) && + (reg->owner == elem->owner) && + (reg->priv == elem->priv) && + (reg->pf == elem->pf) && + (reg->hooknum == elem->hooknum) && + (reg->priority == elem->priority)) { + list_del_rcu(&elem->list); + break; + } + } mutex_unlock(&nf_hook_mutex); + if (&elem->list == nf_hook_list) { + WARN(1, "nf_unregister_net_hook: hook not found!\n"); + return; + } #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); @@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(reg); + nf_queue_nf_hook_drop(elem); + kfree(elem); +} +EXPORT_SYMBOL(nf_unregister_net_hook); + +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_net_hook(net, ®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_net_hooks(net, reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_net_hooks); + +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + while (n-- > 0) + nf_unregister_net_hook(net, ®[n]); +} +EXPORT_SYMBOL(nf_unregister_net_hooks); + +static LIST_HEAD(nf_hook_list); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct net *net, *last; + int ret; + + rtnl_lock(); + for_each_net(net) { + ret = nf_register_net_hook(net, reg); + if (ret && ret != -ENOENT) + goto rollback; + } + list_add_tail(®->list, &nf_hook_list); + rtnl_unlock(); + + return 0; +rollback: + last = net; + for_each_net(net) { + if (net == last) + break; + nf_unregister_net_hook(net, reg); + } + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + struct net *net; + + rtnl_lock(); + list_del(®->list); + for_each_net(net) + nf_unregister_net_hook(net, reg); + rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int nf_register_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + int ret; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) { + ret = nf_register_net_hook(net, elem); + if (ret && ret != -ENOENT) + goto out_undo; + } + rtnl_unlock(); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); + return ret; +} + +static void nf_unregister_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); +} + static int __net_init netfilter_net_init(struct net *net) { + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&net->nf.hooks[i][h]); + } + #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - return 0; + ret = nf_register_hook_list(net); + if (ret) + remove_proc_entry("netfilter", net->proc_net); + + return ret; } static void __net_exit netfilter_net_exit(struct net *net) { + nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } @@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = { int __init netfilter_init(void) { - int i, h, ret; - - for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } + int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) -- cgit v1.2.3 From e7c8899f3e6f2830136cf6e115c4a55ce7a3920a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:07 +0200 Subject: netfilter: move tee_active to core This prepares for a TEE like expression in nftables. We want to ensure only one duplicate is sent, so both will use the same percpu variable to detect duplication. The other use case is detection of recursive call to xtables, but since we don't want dependency from nft to xtables core its put into core.c instead of the x_tables core. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 +++++++++++ net/netfilter/core.c | 3 +++ net/netfilter/xt_TEE.c | 13 ++++++------- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9bbd110ec81b..e01da73ee6c4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -390,4 +390,15 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook; static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif +/** + * nf_skb_duplicated - TEE target has sent a packet + * + * When a xtables target sends a packet, the OUTPUT and POSTROUTING + * hooks are traversed again, i.e. nft and xtables are invoked recursively. + * + * This is used by xtables TEE target to prevent the duplicated skb from + * being duplicated again. + */ +DECLARE_PER_CPU(bool, nf_skb_duplicated); + #endif /*__LINUX_NETFILTER_H*/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 56ead1a1711c..6896cee8b733 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); +DEFINE_PER_CPU(bool, nf_skb_duplicated); +EXPORT_SYMBOL_GPL(nf_skb_duplicated); + int nf_register_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index a747eb475b68..8950e79c4dc9 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -37,7 +37,6 @@ struct xt_tee_priv { }; static const union nf_inet_addr tee_zero_address; -static DEFINE_PER_CPU(bool, tee_active); static struct net *pick_net(struct sk_buff *skb) { @@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_tee_tginfo *info = par->targinfo; struct iphdr *iph; - if (__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) ip_send_check(iph); if (tee_tg_route4(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } @@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - if (__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) --iph->hop_limit; } if (tee_tg_route6(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip6_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } -- cgit v1.2.3 From 7814b6ec6d0d63444abdb49554166c8cfcbd063e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:08 +0200 Subject: netfilter: xtables: don't save/restore jumpstack offset In most cases there is no reentrancy into ip/ip6tables. For skbs sent by REJECT or SYNPROXY targets, there is one level of reentrancy, but its not relevant as those targets issue an absolute verdict, i.e. the jumpstack can be clobbered since its not used after the target issues absolute verdict (ACCEPT, DROP, STOLEN, etc). So the only special case where it is relevant is the TEE target, which returns XT_CONTINUE. This patch changes ip(6)_do_table to always use the jump stack starting from 0. When we detect we're operating on an skb sent via TEE (percpu nf_skb_duplicated is 1) we switch to an alternate stack to leave the original one alone. Since there is no TEE support for arptables, it doesn't need to test if tee is active. The jump stack overflow tests are no longer needed as well -- since ->stacksize is the largest call depth we cannot exceed it. A much better alternative to the external jumpstack would be to just declare a jumps[32] stack on the local stack frame, but that would mean we'd have to reject iptables rulesets that used to work before. Another alternative would be to start rejecting rulesets with a larger call depth, e.g. 1000 -- in this case it would be feasible to allocate the entire stack in the percpu area which would avoid one dereference. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 - net/ipv4/netfilter/arp_tables.c | 11 +++-------- net/ipv4/netfilter/ip_tables.c | 37 ++++++++++++++++++++----------------- net/ipv6/netfilter/ip6_tables.c | 26 ++++++++++++++------------ net/netfilter/x_tables.c | 22 +++++++++++----------- 5 files changed, 48 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 286098a5667f..149284557ca7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -222,7 +222,6 @@ struct xt_table_info { * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; - unsigned int __percpu *stackptr; void ***jumpstack; unsigned char entries[0] __aligned(8); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ae6d0a124213..969fdbe6fbb5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; + /* No TEE support for arptables, so no need to switch to alternate + * stack. All targets that reenter must return absolute verdicts. + */ e = get_entry(table_base, private->hook_entry[hook]); acpar.in = state->in; @@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, } if (table_base + v != arpt_next_entry(e)) { - - if (stackidx >= private->stacksize) { - verdict = NF_DROP; - break; - } jumpstack[stackidx++] = e; } @@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, continue; } - /* Targets which reenter must return - * abs. verdicts - */ acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5e44b35a8de8..a2e4b018a254 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; @@ -331,13 +332,20 @@ ipt_do_table(struct sk_buff *skb, smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", - table->name, hook, origptr, + pr_debug("Entering %s(hook %u), UF %p\n", + table->name, hook, get_entry(table_base, private->underflow[hook])); do { @@ -383,28 +391,24 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) { + if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { - e = jumpstack[--*stackptr]; + e = jumpstack[--stackidx]; pr_debug("Pulled %p out from pos %u\n", - e, *stackptr); + e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; pr_debug("Pushed %p into pos %u\n", - e, *stackptr - 1); + e, stackidx - 1); } e = get_entry(table_base, v); @@ -423,9 +427,8 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; resetting sp from %u to %u\n", - __func__, *stackptr, origptr); - *stackptr = origptr; + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); + xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index baf032179918..531281f0ff86 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -357,8 +358,15 @@ ip6t_do_table(struct sk_buff *skb, cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); @@ -406,20 +414,16 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) + if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else - e = ip6t_next_entry(jumpstack[--*stackptr]); + e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); @@ -437,8 +441,6 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - *stackptr = origptr; - xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 4db7d60d42fa..154447e519ab 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; -/* Allow this many total (re)entries. */ -static const unsigned int xt_jumpstack_multiplier = 2; - /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) { @@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info) kvfree(info->jumpstack); } - free_percpu(info->stackptr); - kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -737,10 +732,6 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) unsigned int size; int cpu; - i->stackptr = alloc_percpu(unsigned int); - if (i->stackptr == NULL) - return -ENOMEM; - size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = vzalloc(size); @@ -753,8 +744,17 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->stacksize == 0) return 0; - i->stacksize *= xt_jumpstack_multiplier; - size = sizeof(void *) * i->stacksize; + /* Jumpstack needs to be able to record two full callchains, one + * from the first rule set traversal, plus one table reentrancy + * via -j TEE without clobbering the callchain that brought us to + * TEE target. + * + * This is done by allocating two jumpstacks per cpu, on reentry + * the upper half of the stack is used. + * + * see the jumpstack setup in ipt_do_table() for more details. + */ + size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { if (size > PAGE_SIZE) i->jumpstack[cpu] = vmalloc_node(size, -- cgit v1.2.3 From dcebd3153e0a7749bb054ab73fa4e1ca33e9d3f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:09 +0200 Subject: netfilter: add and use jump label for xt_tee Don't bother testing if we need to switch to alternate stack unless TEE target is used. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 7 +++++++ net/ipv4/netfilter/ip_tables.c | 3 ++- net/ipv6/netfilter/ip6_tables.c | 3 ++- net/netfilter/x_tables.c | 3 +++ net/netfilter/xt_TEE.c | 2 ++ 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 149284557ca7..b006b719183f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include +#include #include /** @@ -280,6 +281,12 @@ void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +/* xt_tee_enabled - true if x_tables needs to handle reentrancy + * + * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. + */ +extern struct static_key xt_tee_enabled; + /** * xt_write_recseq_begin - start of a write section * diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a2e4b018a254..ff585bdbf850 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,7 +340,8 @@ ipt_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 531281f0ff86..ea6d105063c2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -366,7 +366,8 @@ ip6t_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 154447e519ab..9b42b5ea6dcd 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -727,6 +727,9 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock); DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); +struct static_key xt_tee_enabled __read_mostly; +EXPORT_SYMBOL_GPL(xt_tee_enabled); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 8950e79c4dc9..c5d6556dbc5e 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -251,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) } else info->priv = NULL; + static_key_slow_inc(&xt_tee_enabled); return 0; } @@ -262,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) unregister_netdevice_notifier(&info->priv->notifier); kfree(info->priv); } + static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { -- cgit v1.2.3 From d746d707a8b1421a4ba46b497cb5d59e20161645 Mon Sep 17 00:00:00 2001 From: Anuradha Karuppiah Date: Tue, 14 Jul 2015 13:43:19 -0700 Subject: net core: Add protodown support. This patch introduces the proto_down flag that can be used by user space applications to notify switch drivers that errors have been detected on the device. The switch driver can react to protodown notification by doing a phys down on the associated switch port. Signed-off-by: Anuradha Karuppiah Signed-off-by: Andy Gospodarek Signed-off-by: Roopa Prabhu Signed-off-by: Wilson Kok Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++++ net/core/dev.c | 20 ++++++++++++++++++++ net/core/net-sysfs.c | 14 ++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e20979dfd6a9..45cfd797eb77 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1041,6 +1041,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. + * void (*ndo_change_proto_down)(struct net_device *dev, + * bool proto_down); + * This function is used to pass protocol port error state information + * to the switch driver. The switch driver can react to the proto_down + * by doing a phys down on the associated switch port. + * */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1211,6 +1217,8 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); + int (*ndo_change_proto_down)(struct net_device *dev, + bool proto_down); }; /** @@ -1502,6 +1510,10 @@ enum netdev_priv_flags { * * @qdisc_tx_busylock: XXX: need comments on this one * + * @proto_down: protocol port state information can be sent to the + * switch driver and used to set the phys state of the + * switch port. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -1762,6 +1774,7 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; + bool proto_down; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2982,6 +2995,7 @@ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int dev_change_proto_down(struct net_device *dev, bool proto_down); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index 69445a33ace6..8810b6bbebfe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6074,6 +6074,26 @@ int dev_get_phys_port_name(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_name); +/** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 18b34d771ed4..194c1d03b2b3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); -- cgit v1.2.3 From 0c4f691ff6791e55ac831666df0b49b1679c56e4 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:48 -0700 Subject: net: don't reforward packets already forwarded by offload device Just before queuing skb for xmit on port, check if skb has been marked by switchdev port driver as already fordwarded by device. If so, drop skb. A non-zero skb->offload_fwd_mark field is set by the switchdev port driver/device on ingress to indicate the skb has already been forwarded by the device to egress ports with matching dev->skb_mark. The switchdev port driver would assign a non-zero dev->offload_skb_mark for each device port netdev during registration, for example. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Acked-by: Roopa Prabhu Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/linux/skbuff.h | 9 ++++++++- net/core/dev.c | 10 ++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 45cfd797eb77..8364f29e08be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,6 +1456,8 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * + * @offload_fwd_mark: Offload device fwding mark + * * @trans_start: Time (in jiffies) of last Tx * @watchdog_timeo: Represents the timeout that is used by * the watchdog ( see dev_watchdog() ) @@ -1697,6 +1699,10 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif +#ifdef CONFIG_NET_SWITCHDEV + u32 offload_fwd_mark; +#endif + /* These may be needed for future network-power-down code. */ /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53..af7a09650fa2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -506,6 +506,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking + * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -650,9 +651,15 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; +#endif +#ifdef CONFIG_NET_SWITCHDEV + __u32 offload_fwd_mark; #endif + }; + union { __u32 mark; __u32 reserved_tailroom; diff --git a/net/core/dev.c b/net/core/dev.c index 8810b6bbebfe..2ee15afb412d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -- cgit v1.2.3 From d754f98b502ad9a8c7570d494e1eaa0e6bc0350c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:49 -0700 Subject: net: add phys ID compare helper to test if two IDs are the same Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/switchdev/switchdev.c | 8 ++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8364f29e08be..607b5f41f46f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -766,6 +766,13 @@ struct netdev_phys_item_id { unsigned char id_len; }; +static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, + struct netdev_phys_item_id *b) +{ + return a->id_len == b->id_len && + memcmp(a->id, b->id, a->id_len) == 0; +} + typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9f2add3cba26..4e5bba50ccff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (switchdev_port_attr_get(dev, &attr)) return NULL; - if (nhsel > 0) { - if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) + if (nhsel > 0 && + !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) return NULL; - if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, - attr.u.ppid.id_len)) - return NULL; - } prev_attr = attr; } -- cgit v1.2.3 From 6acc23266054a9969737b435fa012f87465dbc50 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 16 Jul 2015 21:50:50 +0200 Subject: net: remove skb_frag_add_head It's not used anywhere. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index af7a09650fa2..6bd96fe9416a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2678,12 +2678,6 @@ static inline void skb_frag_list_init(struct sk_buff *skb) skb_shinfo(skb)->frag_list = NULL; } -static inline void skb_frag_add_head(struct sk_buff *skb, struct sk_buff *frag) -{ - frag->next = skb_shinfo(skb)->frag_list; - skb_shinfo(skb)->frag_list = frag; -} - #define skb_walk_frags(skb, iter) \ for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next) -- cgit v1.2.3 From f4c190eb8b4f80b12dc98ce7d54a3bea0e4e7e69 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Fri, 17 Jul 2015 00:26:12 +0200 Subject: stmmac: drop custom_* fields from plat_stmmacenet_data Both of these fields are unused and has been unused since they were added 3 and 5 years ago. Drop them since they are clearly not very useful. Signed-off-by: Joachim Eastwood Signed-off-by: David S. Miller --- Documentation/networking/stmmac.txt | 4 ---- include/linux/stmmac.h | 2 -- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index e655e2453c98..5fddefa69baf 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -139,8 +139,6 @@ struct plat_stmmacenet_data { void (*free)(struct platform_device *pdev, void *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); - void *custom_cfg; - void *custom_data; void *bsp_priv; }; @@ -186,8 +184,6 @@ Where: which will be stored in bsp_priv, and then passed to init and exit callbacks. init/exit callbacks should not use or modify platform data. - o custom_cfg/custom_data: this is a custom configuration that can be passed - while initializing the resources. o bsp_priv: another private pointer. For MDIO bus The we have: diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c735f5c91eea..c86a20047cb1 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -123,8 +123,6 @@ struct plat_stmmacenet_data { void (*free)(struct platform_device *pdev, void *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); - void *custom_cfg; - void *custom_data; void *bsp_priv; }; -- cgit v1.2.3 From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jul 2015 20:34:18 -0700 Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers Allow eBPF programs attached to TC qdiscs call skb_vlan_push/pop via helper functions. These functions may change skb->data/hlen which are cached by some JITs to improve performance of ld_abs/ld_ind instructions. Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls, re-compute header len and re-cache skb->data/hlen back into cpu registers. Note, skb->data/hlen are not directly accessible from the programs, so any changes to skb->data done either by these helpers or by other TC actions are safe. eBPF JIT supported by three architectures: - arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is. - x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls (experiments showed that conditional re-caching is slower). - s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present in the program (re-caching is tbd). These helpers allow more scalable handling of vlan from the programs. Instead of creating thousands of vlan netdevs on top of eth0 and attaching TC+ingress+bpf to all of them, the program can be attached to eth0 directly and manipulate vlans as necessary. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 4 +++ arch/x86/net/bpf_jit_comp.c | 80 +++++++++++++++++++++++--------------------- include/linux/bpf.h | 2 ++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 48 ++++++++++++++++++++++++++ 6 files changed, 99 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fee782acc2ee..79c731e8d178 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -973,6 +973,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i */ const u64 func = (u64)__bpf_call_base + imm; + if (bpf_helper_changes_skb_data((void *)func)) + /* TODO reload skb->data, hlen */ + return -1; + REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; /* lg %w1,(%l) */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd74be0..6c335a8fc086 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog); - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4383476a0d48..139d6d2e123f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -192,5 +192,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; +extern const struct bpf_func_proto bpf_skb_vlan_push_proto; +extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 17724f6ea983..69d00555ce35 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -411,6 +411,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +bool bpf_helper_changes_skb_data(void *func); #ifdef CONFIG_BPF_JIT typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2de87e58b12b..2f6c83d714e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -256,6 +256,8 @@ enum bpf_func_id { * Return: classid if != 0 */ BPF_FUNC_get_cgroup_classid, + BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ + BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 247450a5e387..50338071fac4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1437,6 +1437,50 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1476,6 +1520,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for light weight tunnels like mpls. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ++ include/net/lwtunnel.h | 132 +++++++++++++++++++++++++++++++ include/uapi/linux/lwtunnel.h | 15 ++++ net/Kconfig | 7 ++ net/core/Makefile | 1 + net/core/lwtunnel.c | 179 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 340 insertions(+) create mode 100644 include/linux/lwtunnel.h create mode 100644 include/net/lwtunnel.h create mode 100644 include/uapi/linux/lwtunnel.h create mode 100644 net/core/lwtunnel.c (limited to 'include/linux') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 000000000000..97f32f8b4ae1 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include + +#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 000000000000..df24b3611ff4 --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,132 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include +#include +#include +#include +#include + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + int len; + __u8 data[0]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); + int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); +}; + +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; + +#ifdef CONFIG_LWTUNNEL +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + return true; + + return false; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); + +#else + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; + +} + +static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + return NULL; +} + +static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + return 0; +} + +#endif + +#endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h new file mode 100644 index 000000000000..aa611d931a31 --- /dev/null +++ b/include/uapi/linux/lwtunnel.h @@ -0,0 +1,15 @@ +#ifndef _UAPI_LWTUNNEL_H_ +#define _UAPI_LWTUNNEL_H_ + +#include + +enum lwtunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, + __LWTUNNEL_ENCAP_MAX, +}; + +#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) + + +#endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 57a7c5af3175..7021c1bf44d6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,6 +374,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/net/core/Makefile b/net/core/Makefile index fec0856dd6c0..086b01fbe1bd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..d7ae3a235b4b --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,179 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); -- cgit v1.2.3 From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/mpls_iptunnel.h | 6 + include/net/mpls_iptunnel.h | 29 +++++ include/uapi/linux/mpls_iptunnel.h | 28 +++++ net/mpls/Kconfig | 8 +- net/mpls/Makefile | 1 + net/mpls/mpls_iptunnel.c | 233 +++++++++++++++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/mpls_iptunnel.h create mode 100644 include/net/mpls_iptunnel.h create mode 100644 include/uapi/linux/mpls_iptunnel.h create mode 100644 net/mpls/mpls_iptunnel.c (limited to 'include/linux') diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..ef29eb2d6dfd --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +#include + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..d80a0498f77e --- /dev/null +++ b/include/uapi/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H +#define _UAPI_LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde799c854..5c467ef97311 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68c72e6..9ca923625016 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000000..eea096f21ba5 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = rcu_dereference(dst->dev); + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:58 +0200 Subject: vxlan: Flow based tunneling Allows putting a VXLAN device into a new flow-based mode in which skbs with a ip_tunnel_info dst metadata attached will be encapsulated according to the instructions stored in there with the VXLAN device defaults taken into consideration. Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is set, the packet processing will populate a ip_tunnel_info struct for each packet received and attach it to the skb using the new metadata dst. The metadata structure will contain the outer header and tunnel header fields which have been stripped off. Layers further up in the stack such as routing, tc or netfitler can later match on these fields and perform forwarding. It is the responsibility of upper layers to ensure that the flag is set if the metadata is needed. The flag limits the additional cost of metadata collecting based on demand. This prepares the VXLAN device to be steered by the routing and other subsystems which allows to support encapsulation for a large number of tunnel endpoints and tunnel ids through a single net_device which improves the scalability. It also allows for OVS to leverage this mode which in turn allows for the removal of the OVS specific VXLAN code. Because the skb is currently scrubed in vxlan_rcv(), the attachment of the new dst metadata is postponed until after scrubing which requires the temporary addition of a new member to vxlan_metadata. This member is removed again in a later commit after the indirect VXLAN receive API has been removed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 149 ++++++++++++++++++++++++++++++++++++------- include/linux/skbuff.h | 1 + include/net/dst_metadata.h | 13 ++++ include/net/ip_tunnels.h | 14 ++++ include/net/vxlan.h | 10 ++- include/uapi/linux/if_link.h | 1 + 6 files changed, 165 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ec86a11743fd..06c092b05a51 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -49,6 +49,7 @@ #include #include #endif +#include #define VXLAN_VERSION "0.1" @@ -140,6 +141,11 @@ struct vxlan_dev { static u32 vxlan_salt __read_mostly; static struct workqueue_struct *vxlan_wq; +static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) +{ + return vs->flags & VXLAN_F_COLLECT_METADATA; +} + #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) @@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_info *info; struct vxlan_sock *vs; struct vxlanhdr *vxh; u32 flags, vni; - struct vxlan_metadata md = {0}; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; /* Need Vxlan and inner Ethernet header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) @@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) vni &= VXLAN_VNI_MASK; } + if (vxlan_collect_metadata(vs)) { + const struct iphdr *iph = ip_hdr(skb); + + tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); + if (!tun_dst) + goto drop; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = TUNNEL_KEY; + info->key.tun_id = cpu_to_be64(vni >> 8); + if (udp_hdr(skb)->check != 0) + info->key.tun_flags |= TUNNEL_CSUM; + + md = ip_tunnel_info_opts(info, sizeof(*md)); + md->tun_dst = tun_dst; + } else { + memset(md, 0, sizeof(*md)); + } + /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ @@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) struct vxlanhdr_gbp *gbp; gbp = (struct vxlanhdr_gbp *)vxh; - md.gbp = ntohs(gbp->policy_id); + md->gbp = ntohs(gbp->policy_id); + + if (tun_dst) + info->key.tun_flags |= TUNNEL_VXLAN_OPT; if (gbp->dont_learn) - md.gbp |= VXLAN_GBP_DONT_LEARN; + md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) - md.gbp |= VXLAN_GBP_POLICY_APPLIED; + md->gbp |= VXLAN_GBP_POLICY_APPLIED; flags &= ~VXLAN_GBP_USED_BITS; } @@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md.vni = vxh->vx_vni; - vs->rcv(vs, skb, &md); + md->vni = vxh->vx_vni; + vs->rcv(vs, skb, md); return 0; drop: @@ -1247,6 +1286,9 @@ bad_flags: ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); error: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + /* Return non vxlan pkt */ return 1; } @@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, int err = 0; union vxlan_addr *remote_ip; - vni = ntohl(md->vni) >> 8; + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + else + vni = ntohl(md->vni) >> 8; + /* Is this VNI defined? */ vxlan = vxlan_vs_find_vni(vs, vni); if (!vxlan) @@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, #endif } + if (md->tun_dst) { + skb_dst_set(skb, (struct dst_entry *)md->tun_dst); + md->tun_dst = NULL; + } + if ((vxlan->flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) goto drop; skb_reset_network_header(skb); - skb->mark = md->gbp; + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; if (oip6) err = IP6_ECN_decapsulate(oip6, skb); @@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, return; drop: + if (md->tun_dst) + dst_release((struct dst_entry *)md->tun_dst); + /* Consume bad packet */ kfree_skb(skb); } @@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { + struct ip_tunnel_info *info = skb_tunnel_info(skb); struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; const struct iphdr *old_iph; struct flowi4 fl4; union vxlan_addr *dst; - struct vxlan_metadata md; + union vxlan_addr remote_ip; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; __be16 src_port = 0, dst_port; u32 vni; __be16 df = 0; __u8 tos, ttl; int err; + u32 flags = vxlan->flags; - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; + if (rdst) { + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; + } else { + if (!info) { + WARN_ONCE(1, "%s: Missing encapsulation instructions\n", + dev->name); + goto drop; + } + + dst_port = info->key.tp_dst ? : vxlan->dst_port; + vni = be64_to_cpu(info->key.tun_id); + remote_ip.sin.sin_family = AF_INET; + remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + dst = &remote_ip; + } if (vxlan_addr_any(dst)) { if (did_rsc) { @@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, vxlan->port_max, true); if (dst->sa.sa_family == AF_INET) { + if (info) { + if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) + df = htons(IP_DF); + if (info->key.tun_flags & TUNNEL_CSUM) + flags |= VXLAN_F_UDP_CSUM; + else + flags &= ~VXLAN_F_UDP_CSUM; + + ttl = info->key.ipv4_ttl; + tos = info->key.ipv4_tos; + + if (info->options_len) + md = ip_tunnel_info_opts(info, sizeof(*md)); + } else { + md->gbp = skb->mark; + } + memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = rdst->remote_ifindex; + fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0; fl4.flowi4_tos = RT_TOS(tos); fl4.flowi4_mark = skb->mark; fl4.flowi4_proto = IPPROTO_UDP; @@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; - + md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, &md, + src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), - vxlan->flags); + flags); if (err < 0) { /* skb is already freed. */ skb = NULL; @@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = rdst->remote_ifindex; + fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0; fl6.daddr = dst->sin6.sin6_addr; fl6.saddr = vxlan->saddr.sin6.sin6_addr; fl6.flowi6_mark = skb->mark; @@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md.vni = htonl(vni << 8); - md.gbp = skb->mark; + md->vni = htonl(vni << 8); + md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, &md, + 0, ttl, src_port, dst_port, md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2051,6 +2141,7 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); + const struct ip_tunnel_info *info = skb_tunnel_info(skb); struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; @@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } + if (vxlan->flags & VXLAN_F_FLOW_BASED && + info && info->mode == IP_TUNNEL_INFO_TX) { + vxlan_xmit_one(skb, dev, NULL, false); + return NETDEV_TX_OK; + } + f = vxlan_find_mac(vxlan, eth->h_dest); did_rsc = false; @@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, + [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, @@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); + if (data[IFLA_VXLAN_FLOWBASED] && + nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) + vxlan->flags |= VXLAN_F_FLOW_BASED; + if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); @@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || + nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, + !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6bd96fe9416a..648a2c241993 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4f7694f3c7d0..e843937fb30a 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -8,6 +8,9 @@ struct metadata_dst { struct dst_entry dst; size_t opts_len; + union { + struct ip_tunnel_info tun_info; + } u; }; static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) @@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst) + return &md_dst->u.tun_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6b9d559ce5f5..d11530f1c1e2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -38,10 +38,19 @@ struct ip_tunnel_key { __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ +/* Indicates whether the tunnel info structure represents receive + * or transmit tunnel parameters. + */ +enum { + IP_TUNNEL_INFO_RX, + IP_TUNNEL_INFO_TX, +}; + struct ip_tunnel_info { struct ip_tunnel_key key; const void *options; u8 options_len; + u8 mode; }; /* 6rd prefix/relay information */ @@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err, } } +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +{ + return info + 1; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0082b5d33d7d..80a2da29e088 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1< Date: Wed, 22 Jul 2015 16:38:25 +0900 Subject: ipv6: sysctl to restrict candidate source addresses Per RFC 6724, section 4, "Candidate Source Addresses": It is RECOMMENDED that the candidate source addresses be the set of unicast addresses assigned to the interface that will be used to send to the destination (the "outgoing" interface). Add a sysctl to enable this behaviour. Signed-off-by: Erik Kline Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 22 +++++++++++++++++++--- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f63aeefd2c24..1a5ab21bcca5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1460,6 +1460,13 @@ router_solicitations - INTEGER routers are present. Default: 3 +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + use_tempaddr - INTEGER Preference for Privacy Extensions (RFC3041). <= 0 : disable Privacy Extensions diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1319a6bb6b82..06ed637225b8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -57,6 +57,7 @@ struct ipv6_devconf { bool initialized; struct in6_addr secret; } stable_secret; + __s32 use_oif_addrs_only; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 5efa54ae567c..641a146ead7d 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -171,6 +171,7 @@ enum { DEVCONF_USE_OPTIMISTIC, DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, + DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 32153c248959..eb0c6a3a8a00 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_mtu = 1, .stable_secret = { .initialized = false, - } + }, + .use_oif_addrs_only = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .stable_secret = { .initialized = false, }, + .use_oif_addrs_only = 0, }; /* Check if a valid qdisc is available */ @@ -1472,11 +1474,16 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, * include addresses assigned to interfaces * belonging to the same site as the outgoing * interface.) + * - "It is RECOMMENDED that the candidate source addresses + * be the set of unicast addresses assigned to the + * interface that will be used to send to the destination + * (the 'outgoing' interface)." (RFC 6724) */ if (dst_dev) { + idev = __in6_dev_get(dst_dev); if ((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) { - idev = __in6_dev_get(dst_dev); + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || + (idev && idev->cnf.use_oif_addrs_only)) { use_oif_addr = true; } } @@ -4607,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; /* we omit DEVCONF_STABLE_SECRET for now */ + array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } static inline size_t inet6_ifla6_size(void) @@ -5605,6 +5613,14 @@ static struct addrconf_sysctl_table .mode = 0600, .proc_handler = addrconf_sysctl_stable_secret, }, + { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, { /* sentinel */ } -- cgit v1.2.3 From 3bbd14e0a2e3a988b1b5fe702a2539bd8d0ec622 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 13:32:52 +0200 Subject: netfilter: rename local nf_hook_list to hook_list 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") introduced a new nf_hook_list that is global, so let's avoid this overlap. Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. Biederman" --- include/linux/netfilter.h | 14 +++++++------- net/netfilter/core.c | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e01da73ee6c4..d788ce62d826 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -140,20 +140,20 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) return static_key_false(&nf_hooks_needed[pf][hook]); - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #else -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #endif @@ -175,12 +175,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int thresh) { struct net *net = dev_net(indev ? indev : outdev); - struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; + struct list_head *hook_list = &net->nf.hooks[pf][hook]; - if (nf_hook_list_active(nf_hook_list, pf, hook)) { + if (nf_hook_list_active(hook_list, pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, nf_hook_list, hook, thresh, + nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0ecb2b52f276..2a5a0704245c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -62,20 +62,20 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(struct net *net, +static struct list_head *nf_find_hook_list(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list = NULL; + struct list_head *hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - nf_hook_list = ®->dev->nf_hooks_ingress; + hook_list = ®->dev->nf_hooks_ingress; #endif } - return nf_hook_list; + return hook_list; } struct nf_hook_entry { @@ -85,7 +85,7 @@ struct nf_hook_entry { int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; @@ -96,14 +96,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) { + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) { kfree(entry); return -ENOENT; } mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { if (reg->priority < elem->priority) break; } @@ -122,16 +122,16 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) return; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { entry = container_of(elem, struct nf_hook_entry, ops); if (entry->orig_ops == reg) { list_del_rcu(&entry->ops.list); @@ -139,7 +139,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) } } mutex_unlock(&nf_hook_mutex); - if (&elem->list == nf_hook_list) { + if (&elem->list == hook_list) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } -- cgit v1.2.3 From e0910bace663b78c026b73bbd711a24ccf410531 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Jul 2015 15:43:56 +0200 Subject: lwtunnel: export linux/lwtunnel.h to userspace Note also that include/linux/lwtunnel.h is not needed. CC: Thomas Graf CC: Roopa Prabhu Fixes: 499a24256862 ("lwtunnel: infrastructure for handling light weight tunnels like mpls") Signed-off-by: Nicolas Dichtel Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ------ include/uapi/linux/Kbuild | 1 + 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 include/linux/lwtunnel.h (limited to 'include/linux') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h deleted file mode 100644 index 97f32f8b4ae1..000000000000 --- a/include/linux/lwtunnel.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _LINUX_LWTUNNEL_H_ -#define _LINUX_LWTUNNEL_H_ - -#include - -#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 1ff9942718fe..aafb9937b162 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -243,6 +243,7 @@ header-y += limits.h header-y += llc.h header-y += loop.h header-y += lp.h +header-y += lwtunnel.h header-y += magic.h header-y += major.h header-y += map_to_7segment.h -- cgit v1.2.3 From 2be6967cdbc95a9960b620defedbf5e02e2af619 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 23 Jul 2015 23:35:56 +0300 Subject: net/mlx5e: Support ETH_RSS_HASH_XOR The ConnectX-4 HW implements inverted XOR8. To make it act as XOR we re-order the HW RSS indirection table. Set XOR to be the default RSS hash function and add ethtool API to control it. Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 39 ++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 46 +++++++++++++++++----- include/linux/mlx5/mlx5_ifc.h | 6 +-- 4 files changed, 79 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 3d23bd657e3c..61d8433392aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -195,6 +195,7 @@ struct mlx5e_params { u16 rx_hash_log_tbl_sz; bool lro_en; u32 lro_wqe_sz; + u8 rss_hfunc; }; enum { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 388938482ff9..cb2853570504 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -662,6 +662,43 @@ out: return err; } +static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + + if (hfunc) + *hfunc = priv->params.rss_hfunc; + + return 0; +} + +static int mlx5e_set_rxfh(struct net_device *netdev, const u32 *indir, + const u8 *key, const u8 hfunc) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err = 0; + + if (hfunc == ETH_RSS_HASH_NO_CHANGE) + return 0; + + if ((hfunc != ETH_RSS_HASH_XOR) && + (hfunc != ETH_RSS_HASH_TOP)) + return -EINVAL; + + mutex_lock(&priv->state_lock); + + priv->params.rss_hfunc = hfunc; + if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { + mlx5e_close_locked(priv->netdev); + err = mlx5e_open_locked(priv->netdev); + } + + mutex_unlock(&priv->state_lock); + + return err; +} + const struct ethtool_ops mlx5e_ethtool_ops = { .get_drvinfo = mlx5e_get_drvinfo, .get_link = ethtool_op_get_link, @@ -676,4 +713,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = { .set_coalesce = mlx5e_set_coalesce, .get_settings = mlx5e_get_settings, .set_settings = mlx5e_set_settings, + .get_rxfh = mlx5e_get_rxfh, + .set_rxfh = mlx5e_set_rxfh, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 40206da1f9d7..07d36275021e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1158,6 +1158,24 @@ static void mlx5e_close_tises(struct mlx5e_priv *priv) mlx5e_close_tis(priv, tc); } +static int mlx5e_rx_hash_fn(int hfunc) +{ + return (hfunc == ETH_RSS_HASH_TOP) ? + MLX5_RX_HASH_FN_TOEPLITZ : + MLX5_RX_HASH_FN_INVERTED_XOR8; +} + +static int mlx5e_bits_invert(unsigned long a, int size) +{ + int inv = 0; + int i; + + for (i = 0; i < size; i++) + inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i; + + return inv; +} + static int mlx5e_open_rqt(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -1166,11 +1184,10 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv) void *rqtc; int inlen; int err; - int sz; + int log_tbl_sz = priv->params.rx_hash_log_tbl_sz; + int sz = 1 << log_tbl_sz; int i; - sz = 1 << priv->params.rx_hash_log_tbl_sz; - inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (!in) @@ -1182,8 +1199,12 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv) MLX5_SET(rqtc, rqtc, rqt_max_size, sz); for (i = 0; i < sz; i++) { - int ix = i % priv->params.num_channels; + int ix = i; + + if (priv->params.rss_hfunc == ETH_RSS_HASH_XOR) + ix = mlx5e_bits_invert(i, log_tbl_sz); + ix = ix % priv->params.num_channels; MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix]->rq.rqn); } @@ -1254,12 +1275,16 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt) MLX5_SET(tirc, tirc, indirect_table, priv->rqtn); MLX5_SET(tirc, tirc, rx_hash_fn, - MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); - MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); - netdev_rss_key_fill(MLX5_ADDR_OF(tirc, tirc, - rx_hash_toeplitz_key), - MLX5_FLD_SZ_BYTES(tirc, - rx_hash_toeplitz_key)); + mlx5e_rx_hash_fn(priv->params.rss_hfunc)); + if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, + rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, + rx_hash_toeplitz_key); + + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + netdev_rss_key_fill(rss_key, len); + } break; } @@ -1700,6 +1725,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev, MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ; priv->params.num_tc = 1; priv->params.default_vlan_prio = 0; + priv->params.rss_hfunc = ETH_RSS_HASH_XOR; priv->params.lro_en = false && !!MLX5_CAP_ETH(priv->mdev, lro_cap); priv->params.lro_wqe_sz = diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6d2f6fee041c..c60a62bba652 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1936,9 +1936,9 @@ enum { }; enum { - MLX5_TIRC_RX_HASH_FN_HASH_NONE = 0x0, - MLX5_TIRC_RX_HASH_FN_HASH_INVERTED_XOR8 = 0x1, - MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ = 0x2, + MLX5_RX_HASH_FN_NONE = 0x0, + MLX5_RX_HASH_FN_INVERTED_XOR8 = 0x1, + MLX5_RX_HASH_FN_TOEPLITZ = 0x2, }; enum { -- cgit v1.2.3 From 311c7c71c9bb8786c96fee353fe9886c08b017fe Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 23 Jul 2015 23:35:57 +0300 Subject: net/mlx5e: Allocate DMA coherent memory on reader NUMA node By affinity hints and XPS, each mlx5e channel is assigned a CPU core. Channel DMA coherent memory that is written by the NIC and read by SW (e.g CQ buffer) is allocated on the NUMA node of the CPU core assigned for the channel. Channel DMA coherent memory that is written by SW and read by the NIC (e.g SQ/RQ buffer) is allocated on the NUMA node of the NIC. Doorbell record (written by SW and read by the NIC) is an exception since it is accessed by SW more frequently. Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 48 +++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 ++++-- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 12 +++--- drivers/net/ethernet/mellanox/mlx5/core/wq.h | 3 +- include/linux/mlx5/driver.h | 8 ++++ 6 files changed, 70 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 0715b497511f..6cb38304669f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -45,15 +45,34 @@ * register it in a memory region at HCA virtual address 0. */ -int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf) +static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev, + size_t size, dma_addr_t *dma_handle, + int node) +{ + struct mlx5_priv *priv = &dev->priv; + int original_node; + void *cpu_handle; + + mutex_lock(&priv->alloc_mutex); + original_node = dev_to_node(&dev->pdev->dev); + set_dev_node(&dev->pdev->dev, node); + cpu_handle = dma_zalloc_coherent(&dev->pdev->dev, size, + dma_handle, GFP_KERNEL); + set_dev_node(&dev->pdev->dev, original_node); + mutex_unlock(&priv->alloc_mutex); + return cpu_handle; +} + +int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_buf *buf, int node) { dma_addr_t t; buf->size = size; buf->npages = 1; buf->page_shift = (u8)get_order(size) + PAGE_SHIFT; - buf->direct.buf = dma_zalloc_coherent(&dev->pdev->dev, - size, &t, GFP_KERNEL); + buf->direct.buf = mlx5_dma_zalloc_coherent_node(dev, size, + &t, node); if (!buf->direct.buf) return -ENOMEM; @@ -66,6 +85,11 @@ int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf) return 0; } + +int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf) +{ + return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node); +} EXPORT_SYMBOL_GPL(mlx5_buf_alloc); void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf) @@ -75,7 +99,8 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf) } EXPORT_SYMBOL_GPL(mlx5_buf_free); -static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device) +static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev, + int node) { struct mlx5_db_pgdir *pgdir; @@ -84,8 +109,9 @@ static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device) return NULL; bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE); - pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE, - &pgdir->db_dma, GFP_KERNEL); + + pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE, + &pgdir->db_dma, node); if (!pgdir->db_page) { kfree(pgdir); return NULL; @@ -118,7 +144,7 @@ static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir, return 0; } -int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node) { struct mlx5_db_pgdir *pgdir; int ret = 0; @@ -129,7 +155,7 @@ int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) if (!mlx5_alloc_db_from_pgdir(pgdir, db)) goto out; - pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev)); + pgdir = mlx5_alloc_db_pgdir(dev, node); if (!pgdir) { ret = -ENOMEM; goto out; @@ -145,6 +171,12 @@ out: return ret; } +EXPORT_SYMBOL_GPL(mlx5_db_alloc_node); + +int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db) +{ + return mlx5_db_alloc_node(dev, db, dev->priv.numa_node); +} EXPORT_SYMBOL_GPL(mlx5_db_alloc); void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07d36275021e..57cc8960b73b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -272,6 +272,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c, int err; int i; + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); if (err) @@ -502,6 +504,8 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, if (err) return err; + param->wq.db_numa_node = cpu_to_node(c->cpu); + err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) @@ -702,7 +706,8 @@ static int mlx5e_create_cq(struct mlx5e_channel *c, int err; u32 i; - param->wq.numa = cpu_to_node(c->cpu); + param->wq.buf_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = cpu_to_node(c->cpu); param->eq_ix = c->ix; err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, @@ -1000,7 +1005,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv, MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size); MLX5_SET(wq, wq, pd, priv->pdn); - param->wq.numa = dev_to_node(&priv->mdev->pdev->dev); + param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev); param->wq.linear = 1; } @@ -1014,7 +1019,7 @@ static void mlx5e_build_sq_param(struct mlx5e_priv *priv, MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, priv->pdn); - param->wq.numa = dev_to_node(&priv->mdev->pdev->dev); + param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev); } static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index afad529838de..c34eafbf1c04 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -455,7 +455,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) struct mlx5_priv *priv = &mdev->priv; struct msix_entry *msix = priv->msix_arr; int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; - int numa_node = dev_to_node(&mdev->pdev->dev); + int numa_node = priv->numa_node; int err; if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { @@ -668,6 +668,10 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev) INIT_LIST_HEAD(&priv->pgdir_list); spin_lock_init(&priv->mkey_lock); + mutex_init(&priv->alloc_mutex); + + priv->numa_node = dev_to_node(&dev->pdev->dev); + priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root); if (!priv->dbg_root) return -ENOMEM; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 8388411582cf..ce21ee5b2357 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -73,13 +73,14 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride); wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1; - err = mlx5_db_alloc(mdev, &wq_ctrl->db); + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err); return err; } - err = mlx5_buf_alloc(mdev, mlx5_wq_cyc_get_byte_size(wq), &wq_ctrl->buf); + err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq), + &wq_ctrl->buf, param->buf_numa_node); if (err) { mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err); goto err_db_free; @@ -108,13 +109,14 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, wq->log_sz = MLX5_GET(cqc, cqc, log_cq_size); wq->sz_m1 = (1 << wq->log_sz) - 1; - err = mlx5_db_alloc(mdev, &wq_ctrl->db); + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err); return err; } - err = mlx5_buf_alloc(mdev, mlx5_cqwq_get_byte_size(wq), &wq_ctrl->buf); + err = mlx5_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq), + &wq_ctrl->buf, param->buf_numa_node); if (err) { mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err); goto err_db_free; @@ -144,7 +146,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride); wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1; - err = mlx5_db_alloc(mdev, &wq_ctrl->db); + err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h index e0ddd69fb429..6c2a8f95093c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -37,7 +37,8 @@ struct mlx5_wq_param { int linear; - int numa; + int buf_numa_node; + int db_numa_node; }; struct mlx5_wq_ctrl { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5722d88c2429..1c0d5d062d7c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -463,6 +463,10 @@ struct mlx5_priv { /* end: mr staff */ /* start: alloc staff */ + /* protect buffer alocation according to numa node */ + struct mutex alloc_mutex; + int numa_node; + struct mutex pgdir_mutex; struct list_head pgdir_list; /* end: alloc staff */ @@ -672,6 +676,8 @@ void mlx5_health_cleanup(void); void __init mlx5_health_init(void); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, + struct mlx5_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf); struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, @@ -773,6 +779,8 @@ void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); +int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, + int node); void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db); const char *mlx5_command_str(int command); -- cgit v1.2.3 From 88a85f99e51fb2373259ab83c8bb130a9bbf3804 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Thu, 23 Jul 2015 23:35:59 +0300 Subject: net/mlx5e: TX latency optimization to save DMA reads A regular TX WQE execution involves two or more DMA reads - one to fetch the WQE, and another one per WQE gather entry. These DMA reads obviously increase the TX latency. There are two mlx5 mechanisms to bypass these DMA reads: 1) Inline WQE 2) Blue Flame (BF) An inline WQE contains a whole packet, thus saves the DMA read/s of the regular WQE gather entry/s. Inline WQE support was already added in the previous commit. A BF WQE is written directly to the device I/O mapped memory, thus enables saving the DMA read that fetches the WQE. The BF WQE I/O write must be in cache line granularity, thus uses the CPU write combining mechanism. A BF WQE I/O write acts also as a TX doorbell for notifying the device of new TX WQEs. A BF WQE is written to the same I/O mapped address as the regular TX doorbell, thus this address is being mapped twice - once by ioremap() and once by io_mapping_map_wc(). While both mechanisms reduce the TX latency, they both consume more CPU cycles than a regular WQE: - A BF WQE must still be written to host memory, in addition to being written directly to the device I/O mapped memory. - An inline WQE involves copying the SKB data into it. To handle this tradeoff, we introduce here a heuristic algorithm that strives to avoid using these two mechanisms in case the TX queue is being back-pressured by the device, and limit their usage rate otherwise. An inline WQE will always be "Blue Flamed" (written directly to the device I/O mapped memory) while a BF WQE may not be inlined (may contain gather entries). Preliminary testing using netperf UDP_RR shows that the latency goes down from 17.5us to 16.9us, while the message rate (tested with pktgen) stays the same. Signed-off-by: Achiad Shochat Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 24 +++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++----- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 26 ++++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/main.c | 26 +++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 6 ++++++ include/linux/mlx5/driver.h | 4 +++- 6 files changed, 79 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d9dc506188c8..b66edd2c5a61 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -60,6 +60,7 @@ #define MLX5E_TX_CQ_POLL_BUDGET 128 #define MLX5E_UPDATE_STATS_INTERVAL 200 /* msecs */ +#define MLX5E_SQ_BF_BUDGET 16 static const char vport_strings[][ETH_GSTRING_LEN] = { /* vport statistics */ @@ -268,7 +269,9 @@ struct mlx5e_sq { /* dirtied @xmit */ u16 pc ____cacheline_aligned_in_smp; u32 dma_fifo_pc; - u32 bf_offset; + u16 bf_offset; + u16 prev_cc; + u8 bf_budget; struct mlx5e_sq_stats stats; struct mlx5e_cq cq; @@ -281,9 +284,10 @@ struct mlx5e_sq { struct mlx5_wq_cyc wq; u32 dma_fifo_mask; void __iomem *uar_map; + void __iomem *uar_bf_map; struct netdev_queue *txq; u32 sqn; - u32 bf_buf_size; + u16 bf_buf_size; u16 max_inline; u16 edge; struct device *pdev; @@ -493,8 +497,10 @@ int mlx5e_update_priv_params(struct mlx5e_priv *priv, struct mlx5e_params *new_params); static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, - struct mlx5e_tx_wqe *wqe) + struct mlx5e_tx_wqe *wqe, int bf_sz) { + u16 ofst = MLX5_BF_OFFSET + sq->bf_offset; + /* ensure wqe is visible to device before updating doorbell record */ dma_wmb(); @@ -505,9 +511,15 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, */ wmb(); - mlx5_write64((__be32 *)&wqe->ctrl, - sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset, - NULL); + if (bf_sz) { + __iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz); + + /* flush the write-combining mapped buffer */ + wmb(); + + } else { + mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL); + } sq->bf_offset ^= sq->bf_buf_size; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c55fad431cbf..4a87e9dcf52c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -514,6 +514,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; sq->uar_map = sq->uar.map; + sq->uar_bf_map = sq->uar.bf_map; sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; sq->max_inline = param->max_inline; @@ -524,11 +525,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, txq_ix = c->ix + tc * priv->params.num_channels; sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix); - sq->pdev = c->pdev; - sq->mkey_be = c->mkey_be; - sq->channel = c; - sq->tc = tc; - sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS; + sq->pdev = c->pdev; + sq->mkey_be = c->mkey_be; + sq->channel = c; + sq->tc = tc; + sq->edge = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS; + sq->bf_budget = MLX5E_SQ_BF_BUDGET; priv->txq_to_sq_map[txq_ix] = sq; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 351ac6982e22..64380bc0cd6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw) if (notify_hw) { cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - mlx5e_tx_notify_hw(sq, wqe); + mlx5e_tx_notify_hw(sq, wqe, 0); } } @@ -110,7 +110,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb, } static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, - struct sk_buff *skb) + struct sk_buff *skb, bool bf) { /* Some NIC TX decisions, e.g loopback, are based on the packet * headers and occur before the data gather. @@ -118,7 +118,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq, */ #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/) - if (skb_headlen(skb) <= sq->max_inline) + if (bf && (skb_headlen(skb) <= sq->max_inline)) return skb_headlen(skb); return MLX5E_MIN_INLINE; @@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) u8 opcode = MLX5_OPCODE_SEND; dma_addr_t dma_addr = 0; + bool bf = false; u16 headlen; u16 ds_cnt; u16 ihs; @@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) else sq->stats.csum_offload_none++; + if (sq->cc != sq->prev_cc) { + sq->prev_cc = sq->cc; + sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0; + } + if (skb_is_gso(skb)) { u32 payload_len; @@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else { - ihs = mlx5e_get_inline_hdr_size(sq, skb); + bf = sq->bf_budget && + !skb->xmit_more && + !skb_shinfo(skb)->nr_frags; + ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } @@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) } if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) { + int bf_sz = 0; + + if (bf && sq->uar_bf_map) + bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3; + cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; - mlx5e_tx_notify_hw(sq, wqe); + mlx5e_tx_notify_hw(sq, wqe, bf_sz); } /* fill sq edge with nops to avoid wqe wrap around */ while ((sq->pc & wq->sz_m1) > sq->edge) mlx5e_send_nop(sq, false); + sq->bf_budget = bf ? sq->bf_budget - 1 : 0; + sq->stats.packets++; return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c34eafbf1c04..603a8b0908ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -654,6 +654,22 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) } #endif +static int map_bf_area(struct mlx5_core_dev *dev) +{ + resource_size_t bf_start = pci_resource_start(dev->pdev, 0); + resource_size_t bf_len = pci_resource_len(dev->pdev, 0); + + dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len); + + return dev->priv.bf_mapping ? 0 : -ENOMEM; +} + +static void unmap_bf_area(struct mlx5_core_dev *dev) +{ + if (dev->priv.bf_mapping) + io_mapping_free(dev->priv.bf_mapping); +} + static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev) { struct mlx5_priv *priv = &dev->priv; @@ -808,10 +824,13 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev) goto err_stop_eqs; } + if (map_bf_area(dev)) + dev_err(&pdev->dev, "Failed to map blue flame area\n"); + err = mlx5_irq_set_affinity_hints(dev); if (err) { dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); - goto err_free_comp_eqs; + goto err_unmap_bf_area; } MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock); @@ -823,7 +842,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev) return 0; -err_free_comp_eqs: +err_unmap_bf_area: + unmap_bf_area(dev); + free_comp_eqs(dev); err_stop_eqs: @@ -881,6 +902,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev) mlx5_cleanup_qp_table(dev); mlx5_cleanup_cq_table(dev); mlx5_irq_clear_affinity_hints(dev); + unmap_bf_area(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_free_uuars(dev, &priv->uuari); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 9ef85873ceea..eb05c845ece9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include "mlx5_core.h" @@ -246,6 +247,10 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar) goto err_free_uar; } + if (mdev->priv.bf_mapping) + uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping, + uar->index << PAGE_SHIFT); + return 0; err_free_uar: @@ -257,6 +262,7 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar); void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar) { + io_mapping_unmap(uar->bf_map); iounmap(uar->map); mlx5_cmd_free_uar(mdev, uar->index); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1c0d5d062d7c..5fe0cae1a515 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -380,7 +380,7 @@ struct mlx5_uar { u32 index; struct list_head bf_list; unsigned free_bf_bmap; - void __iomem *wc_map; + void __iomem *bf_map; void __iomem *map; }; @@ -435,6 +435,8 @@ struct mlx5_priv { struct mlx5_uuar_info uuari; MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock); + struct io_mapping *bf_mapping; + /* pages stuff */ struct workqueue_struct *pg_wq; struct rb_root page_root; -- cgit v1.2.3 From 77fc29c4bbbbd01ee22c50ce8260fd0f2e08c124 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 27 Jul 2015 14:46:31 +0300 Subject: net/mlx4_core: Preparations for 802.1ad VLAN support mlx4_core preparation to support hardware accelerated 802.1ad VLAN device. To allow 802.1ad accelerated device, "packet has vlan" (phv) Firmware capability should be available. Firmware without the phv capability won't behave properly and can't support 802.1ad device acceleration. The driver checks the Firmware capability and sets the phv bit accordingly in SET_PORT command. Signed-off-by: Hadar Hen Zion Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 82 +++++++++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 15 ++++++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++ include/linux/mlx4/device.h | 5 ++ 5 files changed, 106 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index e30bf57ad7a1..5a1c3d249530 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -154,6 +154,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [26] = "Port ETS Scheduler support", [27] = "Port beacon support", [28] = "RX-ALL support", + [29] = "802.1ad offload support", }; int i; @@ -307,6 +308,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31) +#define QUERY_FUNC_CAP_PHV_BIT 0x40 if (vhcr->op_modifier == 1) { struct mlx4_active_ports actv_ports = @@ -351,6 +353,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); + if (dev->caps.phv_bit[port]) { + field = QUERY_FUNC_CAP_PHV_BIT; + MLX4_PUT(outbox->buf, field, + QUERY_FUNC_CAP_FLAGS0_OFFSET); + } + } else if (vhcr->op_modifier == 0) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); @@ -600,6 +608,9 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, MLX4_GET(func_cap->phys_port_id, outbox, QUERY_FUNC_CAP_PHYS_PORT_ID); + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + func_cap->flags |= (field & QUERY_FUNC_CAP_PHV_BIT); + /* All other resources are allocated by the master, but we still report * 'num' and 'reserved' capabilities as follows: * - num remains the maximum resource index @@ -700,6 +711,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET 0x92 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 #define QUERY_DEV_CAP_CONFIG_DEV_OFFSET 0x94 +#define QUERY_DEV_CAP_PHV_EN_OFFSET 0x96 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET 0x9c @@ -898,6 +910,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; if (field & (1 << 2)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET); + if (field & 0x80) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN; + if (field & 0x40) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN; + MLX4_GET(dev_cap->reserved_lkey, outbox, QUERY_DEV_CAP_RSVD_LKEY_OFFSET); MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); @@ -1992,6 +2010,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + /* phv_check enable */ + MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET); + if (byte_field & 0x2) + param->phv_check_en = 1; out: mlx4_free_cmd_mailbox(dev, mailbox); @@ -2758,3 +2780,63 @@ int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, 0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } + +static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit) +{ +#define SET_PORT_GEN_PHV_VALID 0x10 +#define SET_PORT_GEN_PHV_EN 0x80 + + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + + context->v_ignore_fcs |= SET_PORT_GEN_PHV_VALID; + if (phv_bit) + context->phv_en |= SET_PORT_GEN_PHV_EN; + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv) +{ + int err; + struct mlx4_func_cap func_cap; + + memset(&func_cap, 0, sizeof(func_cap)); + err = mlx4_QUERY_FUNC_CAP(dev, 1, &func_cap); + if (!err) + *phv = func_cap.flags & QUERY_FUNC_CAP_PHV_BIT; + return err; +} +EXPORT_SYMBOL(get_phv_bit); + +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val) +{ + int ret; + + if (mlx4_is_slave(dev)) + return -EPERM; + + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN && + !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) { + ret = mlx4_SET_PORT_phv_bit(dev, port, new_val); + if (!ret) + dev->caps.phv_bit[port] = new_val; + return ret; + } + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(set_phv_bit); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 07cb7c2461ad..08de5555c2f4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -204,6 +204,7 @@ struct mlx4_init_hca_param { u16 cqe_size; /* For use only when CQE stride feature enabled */ u16 eqe_size; /* For use only when EQE stride feature enabled */ u8 rss_ip_frags; + u8 phv_check_en; /* for QUERY_HCA */ }; struct mlx4_init_ib_param { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index d76f4257e305..6f35b6c06193 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -405,6 +405,21 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) { + struct mlx4_init_hca_param hca_param; + + memset(&hca_param, 0, sizeof(hca_param)); + err = mlx4_QUERY_HCA(dev, &hca_param); + /* Turn off PHV_EN flag in case phv_check_en is set. + * phv_check_en is a HW check that parse the packet and verify + * phv bit was reported correctly in the wqe. To allow QinQ + * PHV_EN flag should be set and phv_check_en must be cleared + * otherwise QinQ packets will be drop by the HW. + */ + if (err || hca_param.phv_check_en) + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN; + } + /* Sense port always allowed on supported devices for ConnectX-1 and -2 */ if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT) dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index a092c5c34d43..232b2b55f23b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -787,6 +787,9 @@ struct mlx4_set_port_general_context { u8 pprx; u8 pfcrx; u16 reserved4; + u32 reserved5; + u8 phv_en; + u8 reserved6[3]; }; struct mlx4_set_port_rqp_calc_context { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index fd13c1ce3b4a..bcbf8c72a77b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -211,6 +211,8 @@ enum { MLX4_DEV_CAP_FLAG2_ETS_CFG = 1LL << 26, MLX4_DEV_CAP_FLAG2_PORT_BEACON = 1LL << 27, MLX4_DEV_CAP_FLAG2_IGNORE_FCS = 1LL << 28, + MLX4_DEV_CAP_FLAG2_PHV_EN = 1LL << 29, + MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, }; enum { @@ -581,6 +583,7 @@ struct mlx4_caps { u64 phys_port_id[MLX4_MAX_PORTS + 1]; int tunnel_offload_mode; u8 rx_checksum_flags_port[MLX4_MAX_PORTS + 1]; + u8 phv_bit[MLX4_MAX_PORTS + 1]; u8 alloc_res_qp_mask; u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; @@ -1332,6 +1335,8 @@ int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time); int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value); int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); +int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); +int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); -- cgit v1.2.3 From e802f8e4c54e6adf4215ef9fa3d6eea8fcb10bf9 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 27 Jul 2015 14:46:33 +0300 Subject: net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated support To add Hardware accelerated support in 802.1ad vlan, replace Current VLAN macros to CVLAN. Replace: MLX4_WQE_CTRL_INS_VLAN MLX4_CQE_VLAN_PRESENT_MASK With: MLX4_WQE_CTRL_INS_CVLAN MLX4_CQE_CVLAN_PRESENT_MASK Signed-off-by: Hadar Hen Zion Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/cq.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- include/linux/mlx4/cq.h | 2 +- include/linux/mlx4/qp.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 36eb3d012b6d..180a8f7ec82d 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -871,7 +871,7 @@ repoll: if (is_eth) { wc->sl = be16_to_cpu(cqe->sl_vid) >> 13; if (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK) { + MLX4_CQE_CVLAN_PRESENT_MASK) { wc->vlan_id = be16_to_cpu(cqe->sl_vid) & MLX4_CQE_VID_MASK; } else { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 12c65e1ad6a9..10f6c2f1d5a0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -726,7 +726,7 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, hw_checksum = csum_unfold((__force __sum16)cqe->checksum); - if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK) && + if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) && !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) { hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr); hdr += sizeof(struct vlan_hdr); @@ -907,7 +907,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud gro_skb->csum_level = 1; if ((cqe->vlan_my_qpn & - cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) && + cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) && (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) { u16 vid = be16_to_cpu(cqe->sl_vid); @@ -970,7 +970,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud PKT_HASH_TYPE_L3); if ((be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK) && + MLX4_CQE_CVLAN_PRESENT_MASK) && (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index c10d98f6ad96..7c858f67ef28 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -958,7 +958,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring->bf.offset ^= ring->bf.buf_size; } else { tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN * !!skb_vlan_tag_present(skb); tx_desc->ctrl.fence_size = real_size; diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h index e7ecc12a1163..899a97b20d27 100644 --- a/include/linux/mlx4/cq.h +++ b/include/linux/mlx4/cq.h @@ -88,7 +88,7 @@ struct mlx4_ts_cqe { enum { MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, - MLX4_CQE_VLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_CVLAN_PRESENT_MASK = 1 << 29, MLX4_CQE_L2_TUNNEL = 1 << 27, MLX4_CQE_L2_TUNNEL_CSUM = 1 << 26, MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 6fed539e5456..6c619006c21f 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -272,7 +272,7 @@ enum { MLX4_WQE_CTRL_SOLICITED = 1 << 1, MLX4_WQE_CTRL_IP_CSUM = 1 << 4, MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, - MLX4_WQE_CTRL_INS_VLAN = 1 << 6, + MLX4_WQE_CTRL_INS_CVLAN = 1 << 6, MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, }; -- cgit v1.2.3 From e38af4faf01d0b35df6995fb395e5fa4a4898289 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 27 Jul 2015 14:46:34 +0300 Subject: net/mlx4_en: Add support for hardware accelerated 802.1ad vlan To enable device support in accelerated 802.1ad vlan, the port capability "packet has vlan enable" (phv_en) should be set. Firmware won't work properly, in case phv_en is not set. The user can enable "phv_en" port capability with the new ethtool private flag phv-bit. The phv-bit private flag default value is OFF, users who are interested in 802.1ad hardware acceleration should turn ON the phv-bit private flag: $ ethtool --set-priv-flags eth1 phv-bit on Once the private flag is set, the device is ready for 802.1ad vlan acceleration. The user should also change the interface device features and turn on "tx-vlan-stag-hw-insert" which is off by default: $ ethtool -K eth1 tx-vlan-stag-hw-insert on "phv-bit" private flag setting is available only for Physical Functions(PF), the Virtual Function (VF) will be able to use the feature by setting "tx-vlan-stag-hw-insert" ethtool device feature only if the feature was enabled by the Hypervisor. Signed-off-by: Hadar Hen Zion Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 16 +++++++++ drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 46 +++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/en_rx.c | 16 ++++++++- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 13 ++++--- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 1 + include/linux/mlx4/cq.h | 1 + include/linux/mlx4/qp.h | 1 + 7 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 70f65534e786..f79d8124321e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -102,6 +102,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = { "blueflame", + "phv-bit" }; static const char main_strings[][ETH_GSTRING_LEN] = { @@ -1797,9 +1798,13 @@ static int mlx4_en_get_ts_info(struct net_device *dev, static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) { struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME); + bool phv_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_PHV); + bool phv_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_PHV); int i; + int ret = 0; if (bf_enabled_new != bf_enabled_old) { if (bf_enabled_new) { @@ -1825,6 +1830,17 @@ static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) bf_enabled_new ? "Enabled" : "Disabled"); } + if (phv_enabled_new != phv_enabled_old) { + ret = set_phv_bit(mdev->dev, priv->port, (int)phv_enabled_new); + if (ret) + return ret; + else if (phv_enabled_new) + priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV; + else + priv->pflags &= ~MLX4_EN_PRIV_FLAGS_PHV; + en_info(priv, "PHV bit %s\n", + phv_enabled_new ? "Enabled" : "Disabled"); + } return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index e0de2fd1ce12..4726122ea76b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2184,6 +2184,25 @@ static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } +static netdev_features_t mlx4_en_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx4_en_priv *en_priv = netdev_priv(netdev); + struct mlx4_en_dev *mdev = en_priv->mdev; + + /* Since there is no support for separate RX C-TAG/S-TAG vlan accel + * enable/disable make sure S-TAG flag is always in same state as + * C-TAG. + */ + if (features & NETIF_F_HW_VLAN_CTAG_RX && + !(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) + features |= NETIF_F_HW_VLAN_STAG_RX; + else + features &= ~NETIF_F_HW_VLAN_STAG_RX; + + return features; +} + static int mlx4_en_set_features(struct net_device *netdev, netdev_features_t features) { @@ -2218,6 +2237,10 @@ static int mlx4_en_set_features(struct net_device *netdev, en_info(priv, "Turn %s TX vlan strip offload\n", (features & NETIF_F_HW_VLAN_CTAG_TX) ? "ON" : "OFF"); + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX)) + en_info(priv, "Turn %s TX S-VLAN strip offload\n", + (features & NETIF_F_HW_VLAN_STAG_TX) ? "ON" : "OFF"); + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_LOOPBACK)) { en_info(priv, "Turn %s loopback\n", (features & NETIF_F_LOOPBACK) ? "ON" : "OFF"); @@ -2460,6 +2483,7 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_poll_controller = mlx4_en_netpoll, #endif .ndo_set_features = mlx4_en_set_features, + .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = mlx4_en_setup_tc, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = mlx4_en_filter_rfs, @@ -2500,6 +2524,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_poll_controller = mlx4_en_netpoll, #endif .ndo_set_features = mlx4_en_set_features, + .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = mlx4_en_setup_tc, #ifdef CONFIG_RFS_ACCEL .ndo_rx_flow_steer = mlx4_en_filter_rfs, @@ -2931,6 +2956,27 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->hw_features |= NETIF_F_LOOPBACK | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) { + dev->features |= NETIF_F_HW_VLAN_STAG_RX | + NETIF_F_HW_VLAN_STAG_FILTER; + dev->hw_features |= NETIF_F_HW_VLAN_STAG_RX; + } + + if (mlx4_is_slave(mdev->dev)) { + int phv; + + err = get_phv_bit(mdev->dev, port, &phv); + if (!err && phv) { + dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV; + } + } else { + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN && + !(mdev->dev->caps.flags2 & + MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) + dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX; + } + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) dev->hw_features |= NETIF_F_RXFCS; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 10f6c2f1d5a0..a67fbb90d69e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -912,6 +912,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud u16 vid = be16_to_cpu(cqe->sl_vid); __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid); + } else if ((be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_SVLAN_PRESENT_MASK) && + (dev->features & NETIF_F_HW_VLAN_STAG_RX)) { + __vlan_hwaccel_put_tag(gro_skb, + htons(ETH_P_8021AD), + be16_to_cpu(cqe->sl_vid)); } if (dev->features & NETIF_F_RXHASH) @@ -973,6 +979,11 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud MLX4_CQE_CVLAN_PRESENT_MASK) && (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid)); + else if ((be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_SVLAN_PRESENT_MASK) && + (dev->features & NETIF_F_HW_VLAN_STAG_RX)) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), + be16_to_cpu(cqe->sl_vid)); if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { timestamp = mlx4_en_get_cqe_ts(cqe); @@ -1070,7 +1081,10 @@ static const int frag_sizes[] = { void mlx4_en_calc_rx_buf(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); - int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN; + /* VLAN_HLEN is added twice,to support skb vlan tagged with multiple + * headers. (For example: ETH_P_8021Q and ETH_P_8021AD). + */ + int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN); int buf_size = 0; int i = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 7c858f67ef28..494e7762fdb1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -718,6 +718,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) u32 index, bf_index; __be32 op_own; u16 vlan_tag = 0; + u16 vlan_proto = 0; int i_frag; int lso_header_size; void *fragptr = NULL; @@ -750,9 +751,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; } - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { vlan_tag = skb_vlan_tag_get(skb); - + vlan_proto = be16_to_cpu(skb->vlan_proto); + } netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); @@ -958,8 +960,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring->bf.offset ^= ring->bf.buf_size; } else { tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); - tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN * - !!skb_vlan_tag_present(skb); + if (vlan_proto == ETH_P_8021AD) + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN; + else if (vlan_proto == ETH_P_8021Q) + tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN; + tx_desc->ctrl.fence_size = real_size; /* Ensure new descriptor hits memory diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 666d1669eb52..defcf8c395bf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -95,6 +95,7 @@ */ #define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1 +#define MLX4_EN_PRIV_FLAGS_PHV 2 #define MLX4_EN_WATCHDOG_TIMEOUT (15 * HZ) diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h index 899a97b20d27..09cebe528488 100644 --- a/include/linux/mlx4/cq.h +++ b/include/linux/mlx4/cq.h @@ -89,6 +89,7 @@ struct mlx4_ts_cqe { enum { MLX4_CQE_L2_TUNNEL_IPOK = 1 << 31, MLX4_CQE_CVLAN_PRESENT_MASK = 1 << 29, + MLX4_CQE_SVLAN_PRESENT_MASK = 1 << 30, MLX4_CQE_L2_TUNNEL = 1 << 27, MLX4_CQE_L2_TUNNEL_CSUM = 1 << 26, MLX4_CQE_L2_TUNNEL_IPV4 = 1 << 25, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 6c619006c21f..de45a51b3f04 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -273,6 +273,7 @@ enum { MLX4_WQE_CTRL_IP_CSUM = 1 << 4, MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 << 5, MLX4_WQE_CTRL_INS_CVLAN = 1 << 6, + MLX4_WQE_CTRL_INS_SVLAN = 1 << 7, MLX4_WQE_CTRL_STRONG_ORDER = 1 << 7, MLX4_WQE_CTRL_FORCE_LOOPBACK = 1 << 0, }; -- cgit v1.2.3 From 0933328a1b8adb6c8b2b8c8b823dad0295659c40 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Wed, 29 Jul 2015 00:09:02 +0200 Subject: stmmac: remove unused stmmac_of_data struct As dwmac-* drivers that need OF match have been converted to use their own internal OF match data structure this can now be removed. Signed-off-by: Joachim Eastwood Signed-off-by: David S. Miller --- Documentation/networking/stmmac.txt | 2 -- include/linux/stmmac.h | 18 ------------------ 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index 5fddefa69baf..de5c42342ec3 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -274,8 +274,6 @@ capability register can replace what has been passed from the platform. Please see the following document: Documentation/devicetree/bindings/net/stmmac.txt -and the stmmac_of_data structure inside the include/linux/stmmac.h header file. - 4.11) This is a summary of the content of some relevant files: o stmmac_main.c: to implement the main network device driver; o stmmac_mdio.c: to provide mdio functions; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c86a20047cb1..b43cd56b78e9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -125,22 +125,4 @@ struct plat_stmmacenet_data { void (*exit)(struct platform_device *pdev, void *priv); void *bsp_priv; }; - -/* of_data for SoC glue layer device tree bindings */ - -struct stmmac_of_data { - int has_gmac; - int enh_desc; - int tx_coe; - int rx_coe; - int bugged_jumbo; - int pmt; - int riwt_off; - void (*fix_mac_speed)(void *priv, unsigned int speed); - void (*bus_setup)(void __iomem *ioaddr); - void *(*setup)(struct platform_device *pdev); - void (*free)(struct platform_device *pdev, void *priv); - int (*init)(struct platform_device *pdev, void *priv); - void (*exit)(struct platform_device *pdev, void *priv); -}; #endif -- cgit v1.2.3 From 75fee59550a9899fd9438ebc0a64c972829a8dd2 Mon Sep 17 00:00:00 2001 From: Joachim Eastwood Date: Wed, 29 Jul 2015 00:09:03 +0200 Subject: stmmac: remove setup/free glue callbacks As all dwmac-* drivers have been converted to have a proper probe function the setup callback can now be removed. Also remove the free callback that wasn't used by any driver. New dwmac-* drivers should implement standard probe and remove functions to preform any needed setup and teardown. Signed-off-by: Joachim Eastwood Signed-off-by: David S. Miller --- Documentation/networking/stmmac.txt | 8 ++------ drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c | 7 ------- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 --- include/linux/stmmac.h | 2 -- 4 files changed, 2 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index de5c42342ec3..2903b1cf4d70 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -135,8 +135,6 @@ struct plat_stmmacenet_data { int maxmtu; void (*fix_mac_speed)(void *priv, unsigned int speed); void (*bus_setup)(void __iomem *ioaddr); - void *(*setup)(struct platform_device *pdev); - void (*free)(struct platform_device *pdev, void *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); void *bsp_priv; @@ -177,12 +175,10 @@ Where: o bus_setup: perform HW setup of the bus. For example, on some ST platforms this field is used to configure the AMBA bridge to generate more efficient STBus traffic. - o setup/init/exit: callbacks used for calling a custom initialization; + o init/exit: callbacks used for calling a custom initialization; this is sometime necessary on some platforms (e.g. ST boxes) where the HW needs to have set some PIO lines or system cfg - registers. setup should return a pointer to private data, - which will be stored in bsp_priv, and then passed to init and - exit callbacks. init/exit callbacks should not use or modify + registers. init/exit callbacks should not use or modify platform data. o bsp_priv: another private pointer. diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index f4fe9f1a33b4..b1e5f24708c9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -46,13 +46,6 @@ static int dwmac_generic_probe(struct platform_device *pdev) plat_dat->unicast_filter_entries = 1; } - /* Custom setup (if needed) */ - if (plat_dat->setup) { - plat_dat->bsp_priv = plat_dat->setup(pdev); - if (IS_ERR(plat_dat->bsp_priv)) - return PTR_ERR(plat_dat->bsp_priv); - } - /* Custom initialisation (if needed) */ if (plat_dat->init) { ret = plat_dat->init(pdev, plat_dat->bsp_priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 55e569b330b2..1cb660405f35 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -300,9 +300,6 @@ int stmmac_pltfr_remove(struct platform_device *pdev) if (priv->plat->exit) priv->plat->exit(pdev, priv->plat->bsp_priv); - if (priv->plat->free) - priv->plat->free(pdev, priv->plat->bsp_priv); - return ret; } EXPORT_SYMBOL_GPL(stmmac_pltfr_remove); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b43cd56b78e9..eead8ab93c0a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -119,8 +119,6 @@ struct plat_stmmacenet_data { int rx_fifo_size; void (*fix_mac_speed)(void *priv, unsigned int speed); void (*bus_setup)(void __iomem *ioaddr); - void *(*setup)(struct platform_device *pdev); - void (*free)(struct platform_device *pdev, void *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); void *bsp_priv; -- cgit v1.2.3 From 72b1e5e4cac72efa6b739b47e41f53e4520b4194 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 Jul 2015 16:21:30 +0200 Subject: netfilter: bridge: reduce nf_bridge_info to 32 bytes again We can use union for most of the temporary cruft (original ipv4/ipv6 address, source mac, physoutdev) since they're used during different stages of br netfilter traversal. Also get rid of the last two ->mask users. Shrinks struct from 48 to 32 on 64bit arch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 12 +++++++++--- include/linux/skbuff.h | 19 +++++++++++++------ net/bridge/br_netfilter_hooks.c | 14 ++++++-------- net/bridge/br_netfilter_ipv6.c | 2 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 7 ++----- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 7 ++----- 6 files changed, 33 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 6d80fc686323..2437b8a5d7a9 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -17,9 +17,6 @@ enum nf_br_hook_priorities { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -#define BRNF_BRIDGED_DNAT 0x02 -#define BRNF_NF_BRIDGE_PREROUTING 0x08 - int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) @@ -63,8 +60,17 @@ nf_bridge_get_physoutdev(const struct sk_buff *skb) { return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL; } + +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return skb->nf_bridge && skb->nf_bridge->in_prerouting; +} #else #define br_drop_fake_rtable(skb) do { } while (0) +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return false; +} #endif /* CONFIG_BRIDGE_NETFILTER */ #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53..ac732e67a6c8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -173,17 +173,24 @@ struct nf_bridge_info { BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; - bool pkt_otherhost; + u8 pkt_otherhost:1; + u8 in_prerouting:1; + u8 bridged_dnat:1; __u16 frag_max_size; - unsigned int mask; struct net_device *physindev; union { - struct net_device *physoutdev; - char neigh_header[8]; - }; - union { + /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; + + /* after prerouting + nat detected: store original source + * mac since neigh resolution overwrites it, only used while + * skb is out in neigh layer. + */ + char neigh_header[8]; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; }; }; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c8b9bcfe997e..ec51c2ba30e9 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ - nf_bridge->mask |= BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } @@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->pkt_otherhost = true; } - nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; skb->dev = brnf_get_logical_dev(skb, skb->dev); @@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && - !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) return NF_STOP; - } return NF_ACCEPT; } @@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); skb_pull(skb, ETH_HLEN); - nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); @@ -887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) static int br_nf_dev_xmit(struct sk_buff *skb) { - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 13b7d1e3d185..77383bfe7ea3 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { skb_dst_drop(skb); v6ops->route_input(skb); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index c88b7d434718..b69e82bda215 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone; else diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index a45db0b4785c..267fb8d5876e 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone; else -- cgit v1.2.3 From 7b36f92934e40d1ee24e5617ddedb852e10086ca Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jul 2015 12:42:47 +0200 Subject: bpf: provide helper that indicates eBPF was migrated During recent discussions we had with Michael, we found that it would be useful to have an indicator that tells the JIT that an eBPF program had been migrated from classic instructions into eBPF instructions, as only in that case A and X need to be cleared in the prologue. Such eBPF programs do not set a particular type, but all have BPF_PROG_TYPE_UNSPEC. Thus, introduce a small helper for cde66c2d88da ("s390/bpf: Only clear A and X for converted BPF programs") and possibly others in future. Signed-off-by: Daniel Borkmann Cc: Michael Holzheu Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bbbac6da37af..9f4bbc09bf07 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1245,7 +1245,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) jit->lit = jit->lit_start; jit->prg = 0; - bpf_jit_prologue(jit, fp->type == BPF_PROG_TYPE_UNSPEC); + bpf_jit_prologue(jit, bpf_prog_was_classic(fp)); for (i = 0; i < fp->len; i += insn_count) { insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) diff --git a/include/linux/filter.h b/include/linux/filter.h index 69d00555ce35..6b025491120d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -354,6 +354,16 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) offsetof(struct bpf_prog, insns[proglen])); } +static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) +{ + /* When classic BPF programs have been loaded and the arch + * does not have a classic BPF JIT (anymore), they have been + * converted via bpf_migrate_filter() to eBPF and thus always + * have an unspec program type. + */ + return prog->type == BPF_PROG_TYPE_UNSPEC; +} + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) #ifdef CONFIG_DEBUG_SET_MODULE_RONX -- cgit v1.2.3 From b13138ef72178a13f34e33883f9f093f9e3b1bda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jul 2015 12:42:49 +0200 Subject: bpf: also show process name/pid in bpf_jit_dump It can be useful for testing to see the actual process/pid who is loading a given filter. I was running some BPF test program and noticed unusual filter loads from time to time, triggered by some other application in the background. bpf_jit_disasm is still working after this change. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6b025491120d..fa2cab985e57 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -438,8 +439,9 @@ void bpf_jit_free(struct bpf_prog *fp); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { - pr_err("flen=%u proglen=%u pass=%u image=%pK\n", - flen, proglen, pass, image); + pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, + proglen, pass, image, current->comm, task_pid_nr(current)); + if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); -- cgit v1.2.3 From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Jul 2015 14:28:42 +0800 Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface") disabled accept hop limit from RA if it is smaller than the current hop limit for security stuff. But this behavior kind of break the RFC definition. RFC 4861, 6.3.4. Processing Received Router Advertisements A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time, and Retrans Timer) may contain a value denoting that it is unspecified. In such cases, the parameter should be ignored and the host should continue using whatever value it is already using. If the received Cur Hop Limit value is non-zero, the host SHOULD set its CurHopLimit variable to the received value. So add sysctl option accept_ra_min_hop_limit to let user choose the minimum hop limit value they can accept from RA. And set default to 1 to meet RFC standards. Signed-off-by: Hangbin Liu Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 16 +++++++--------- 5 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1a5ab21bcca5..00d26d919459 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1340,6 +1340,14 @@ accept_ra_from_local - BOOLEAN disabled if accept_ra_from_local is disabled on a specific interface. +accept_ra_min_hop_limit - INTEGER + Minimum hop limit Information in Router Advertisement. + + Hop limit Information in Router Advertisement less than this + variable shall be ignored. + + Default: 1 + accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 06ed637225b8..cb9dcad72372 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -29,6 +29,7 @@ struct ipv6_devconf { __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; + __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 641a146ead7d..80f3b74446a1 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -172,6 +172,7 @@ enum { DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, + DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eb0c6a3a8a00..53e3a9d756b0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -237,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -4588,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; @@ -5484,6 +5487,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_min_hop_limit", + .data = &ipv6_devconf.accept_ra_min_hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0a05b35a90fc..6e184e02fd3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); - if (ra_msg->icmph.icmp6_hop_limit) { - /* Only set hop_limit on the interface if it is higher than - * the current hop_limit. - */ - if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: -- cgit v1.2.3 From f70ea018da0631e10c26a02f5a82d626ffef5bd7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:10 -0700 Subject: net: Add functions to get skb->hash based on flow structures Add skb_get_hash_flowi6 and skb_get_hash_flowi4 which derive an sk_buff hash from flowi6 and flowi4 structures respectively. These functions can be called when creating a packet in the output path where the new sk_buff does not yet contain a fully formed packet that is parsable by flow dissector. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++ net/core/flow_dissector.c | 58 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 648a2c241993..b7c1286e247d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. * @@ -945,6 +946,26 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); + +static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +{ + if (!skb->l4_hash && !skb->sw_hash) + __skb_get_hash_flowi6(skb, fl6); + + return skb->hash; +} + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); + +static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +{ + if (!skb->l4_hash && !skb->sw_hash) + __skb_get_hash_flowi4(skb, fl4); + + return skb->hash; +} + __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2a834c6179b9..11e6540fa386 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -590,6 +590,15 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); +static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, + struct flow_keys *keys) +{ + if (keys->ports.ports) + skb->l4_hash = 1; + skb->sw_hash = 1; + skb->hash = hash; +} + /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from @@ -609,10 +618,8 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; + + __skb_set_sw_hash(skb, hash, &keys); } EXPORT_SYMBOL(__skb_get_hash); @@ -624,6 +631,49 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys.ports.src = fl6->fl6_sport; + keys.ports.dst = fl6->fl6_dport; + keys.keyid.keyid = fl6->fl6_gre_key; + keys.tags.flow_label = (__force u32)fl6->flowlabel; + keys.basic.ip_proto = fl6->flowi6_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi6); + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + keys.addrs.v4addrs.src = fl4->saddr; + keys.addrs.v4addrs.dst = fl4->daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys.ports.src = fl4->fl4_sport; + keys.ports.dst = fl4->fl4_dport; + keys.keyid.keyid = fl4->fl4_gre_key; + keys.basic.ip_proto = fl4->flowi4_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi4); + u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { -- cgit v1.2.3 From d9eea403ca81f60cd535d354c77ada4c2bee8d66 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Tue, 4 Aug 2015 14:05:42 +0300 Subject: net/mlx5_core: Introduce access function to modify RSS/LRO params To be used by the mlx5 Eth driver in following commit. This is in preparation for netdev "light-weight" open/stop flow change described in previous commit. Signed-off-by: Achiad Shochat Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 12 ++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/transobj.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index c4f3f74908ec..e6453f61141e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -163,6 +163,18 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_tir_out)]; + + MLX5_SET(modify_tir_in, in, tirn, tirn); + MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); +} + void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) { u32 in[MLX5_ST_SZ_DW(destroy_tir_out)]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h index 10bd75e7d9b1..d436c2d8b527 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h @@ -45,6 +45,8 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); +int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, + int inlen); void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c60a62bba652..469b7bda3304 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4050,6 +4050,13 @@ struct mlx5_ifc_modify_tis_in_bits { struct mlx5_ifc_tisc_bits ctx; }; +struct mlx5_ifc_modify_tir_bitmask_bits { + u8 reserved[0x20]; + + u8 reserved1[0x1f]; + u8 lro[0x1]; +}; + struct mlx5_ifc_modify_tir_out_bits { u8 status[0x8]; u8 reserved_0[0x18]; @@ -4071,7 +4078,7 @@ struct mlx5_ifc_modify_tir_in_bits { u8 reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_modify_tir_bitmask_bits bitmask; u8 reserved_4[0x40]; -- cgit v1.2.3 From 5c50368f38317627421bf24a0b66b1af0d44eddc Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Tue, 4 Aug 2015 14:05:43 +0300 Subject: net/mlx5e: Light-weight netdev open/stop Create/destroy TIRs, TISs and flow tables upon PCI probe/remove rather than upon the netdev ndo_open/stop. Upon ndo_stop(), redirect all RX traffic to the (lately introduced) "Drop RQ" and then close only the RX/TX rings, leaving the TIRs, TISs and flow tables alive. Signed-off-by: Achiad Shochat Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 237 ++++++++++++++------- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 12 ++ drivers/net/ethernet/mellanox/mlx5/core/transobj.h | 2 + include/linux/mlx5/mlx5_ifc.h | 9 +- 4 files changed, 184 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index baa7a69bb694..33d08bb11f84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1301,14 +1301,18 @@ static void mlx5e_fill_rqt_rqns(struct mlx5e_priv *priv, void *rqtc, ix = ix % priv->params.num_channels; MLX5_SET(rqtc, rqtc, rq_num[i], - priv->channel[ix]->rq.rqn); + test_bit(MLX5E_STATE_OPENED, &priv->state) ? + priv->channel[ix]->rq.rqn : + priv->drop_rq.rqn); } break; default: /* MLX5E_SINGLE_RQ_RQT */ MLX5_SET(rqtc, rqtc, rq_num[0], - priv->channel[0]->rq.rqn); + test_bit(MLX5E_STATE_OPENED, &priv->state) ? + priv->channel[0]->rq.rqn : + priv->drop_rq.rqn); break; } @@ -1347,19 +1351,95 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix) return err; } +static int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 *in; + void *rqtc; + int inlen; + int log_sz; + int sz; + int err; + + log_sz = (rqt_ix == MLX5E_SINGLE_RQ_RQT) ? 0 : + priv->params.rx_hash_log_tbl_sz; + sz = 1 << log_sz; + + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + + mlx5e_fill_rqt_rqns(priv, rqtc, rqt_ix); + + MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1); + + err = mlx5_core_modify_rqt(mdev, priv->rqtn[rqt_ix], in, inlen); + + kvfree(in); + + return err; +} + static void mlx5e_close_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix) { mlx5_core_destroy_rqt(priv->mdev, priv->rqtn[rqt_ix]); } +static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv) +{ + if (!priv->params.lro_en) + return; + +#define ROUGH_MAX_L2_L3_HDR_SZ 256 + + MLX5_SET(tirc, tirc, lro_enable_mask, + MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | + MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); + MLX5_SET(tirc, tirc, lro_max_ip_payload_size, + (priv->params.lro_wqe_sz - + ROUGH_MAX_L2_L3_HDR_SZ) >> 8); + MLX5_SET(tirc, tirc, lro_timeout_period_usecs, + MLX5_CAP_ETH(priv->mdev, + lro_timer_supported_periods[3])); +} + +static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt) +{ + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tir_in, in, bitmask.lro, 1); + tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); + + mlx5e_build_tir_ctx_lro(tirc, priv); + + err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen); + + kvfree(in); + + return err; +} + static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt) { void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); MLX5_SET(tirc, tirc, transport_domain, priv->tdn); -#define ROUGH_MAX_L2_L3_HDR_SZ 256 - #define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP) @@ -1372,17 +1452,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt) MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_IPSEC_SPI) - if (priv->params.lro_en) { - MLX5_SET(tirc, tirc, lro_enable_mask, - MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | - MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); - MLX5_SET(tirc, tirc, lro_max_ip_payload_size, - (priv->params.lro_wqe_sz - - ROUGH_MAX_L2_L3_HDR_SZ) >> 8); - MLX5_SET(tirc, tirc, lro_timeout_period_usecs, - MLX5_CAP_ETH(priv->mdev, - lro_timer_supported_periods[3])); - } + mlx5e_build_tir_ctx_lro(tirc, priv); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); @@ -1568,12 +1638,20 @@ static int mlx5e_set_dev_port_mtu(struct net_device *netdev) return 0; } +static void mlx5e_redirect_rqts(struct mlx5e_priv *priv) +{ + mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT); + mlx5e_redirect_rqt(priv, MLX5E_SINGLE_RQ_RQT); +} + int mlx5e_open_locked(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); int num_txqs; int err; + set_bit(MLX5E_STATE_OPENED, &priv->state); + num_txqs = priv->params.num_channels * priv->params.num_tc; netif_set_real_num_tx_queues(netdev, num_txqs); netif_set_real_num_rx_queues(netdev, priv->params.num_channels); @@ -1582,83 +1660,32 @@ int mlx5e_open_locked(struct net_device *netdev) if (err) return err; - err = mlx5e_open_tises(priv); - if (err) { - netdev_err(netdev, "%s: mlx5e_open_tises failed, %d\n", - __func__, err); - return err; - } - err = mlx5e_open_channels(priv); if (err) { netdev_err(netdev, "%s: mlx5e_open_channels failed, %d\n", __func__, err); - goto err_close_tises; - } - - err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT); - if (err) { - netdev_err(netdev, "%s: mlx5e_open_rqt(INDIR) failed, %d\n", - __func__, err); - goto err_close_channels; - } - - err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT); - if (err) { - netdev_err(netdev, "%s: mlx5e_open_rqt(SINGLE) failed, %d\n", - __func__, err); - goto err_close_rqt_indir; - } - - err = mlx5e_open_tirs(priv); - if (err) { - netdev_err(netdev, "%s: mlx5e_open_tir failed, %d\n", - __func__, err); - goto err_close_rqt_single; - } - - err = mlx5e_open_flow_table(priv); - if (err) { - netdev_err(netdev, "%s: mlx5e_open_flow_table failed, %d\n", - __func__, err); - goto err_close_tirs; + return err; } err = mlx5e_add_all_vlan_rules(priv); if (err) { netdev_err(netdev, "%s: mlx5e_add_all_vlan_rules failed, %d\n", __func__, err); - goto err_close_flow_table; + goto err_close_channels; } mlx5e_init_eth_addr(priv); - set_bit(MLX5E_STATE_OPENED, &priv->state); - mlx5e_update_carrier(priv); + mlx5e_redirect_rqts(priv); mlx5e_set_rx_mode_core(priv); schedule_delayed_work(&priv->update_stats_work, 0); return 0; -err_close_flow_table: - mlx5e_close_flow_table(priv); - -err_close_tirs: - mlx5e_close_tirs(priv); - -err_close_rqt_single: - mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT); - -err_close_rqt_indir: - mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT); - err_close_channels: mlx5e_close_channels(priv); -err_close_tises: - mlx5e_close_tises(priv); - return err; } @@ -1682,13 +1709,9 @@ int mlx5e_close_locked(struct net_device *netdev) mlx5e_set_rx_mode_core(priv); mlx5e_del_all_vlan_rules(priv); + mlx5e_redirect_rqts(priv); netif_carrier_off(priv->netdev); - mlx5e_close_flow_table(priv); - mlx5e_close_tirs(priv); - mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT); - mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT); mlx5e_close_channels(priv); - mlx5e_close_tises(priv); return 0; } @@ -1766,6 +1789,8 @@ static int mlx5e_set_features(struct net_device *netdev, mlx5e_close_locked(priv->netdev); priv->params.lro_en = !!(features & NETIF_F_LRO); + mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP); + mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP); if (was_opened) err = mlx5e_open_locked(priv->netdev); @@ -2026,16 +2051,72 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_dealloc_transport_domain; } + err = mlx5e_open_tises(priv); + if (err) { + mlx5_core_warn(mdev, "open tises failed, %d\n", err); + goto err_destroy_mkey; + } + + err = mlx5e_open_drop_rq(priv); + if (err) { + mlx5_core_err(mdev, "open drop rq failed, %d\n", err); + goto err_close_tises; + } + + err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT); + if (err) { + mlx5_core_warn(mdev, "open rqt(INDIR) failed, %d\n", err); + goto err_close_drop_rq; + } + + err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT); + if (err) { + mlx5_core_warn(mdev, "open rqt(SINGLE) failed, %d\n", err); + goto err_close_rqt_indir; + } + + err = mlx5e_open_tirs(priv); + if (err) { + mlx5_core_warn(mdev, "open tirs failed, %d\n", err); + goto err_close_rqt_single; + } + + err = mlx5e_open_flow_table(priv); + if (err) { + mlx5_core_warn(mdev, "open flow table failed, %d\n", err); + goto err_close_tirs; + } + + mlx5e_init_eth_addr(priv); + err = register_netdev(netdev); if (err) { mlx5_core_err(mdev, "register_netdev failed, %d\n", err); - goto err_destroy_mkey; + goto err_close_flow_table; } mlx5e_enable_async_events(priv); return priv; +err_close_flow_table: + mlx5e_close_flow_table(priv); + +err_close_tirs: + mlx5e_close_tirs(priv); + +err_close_rqt_single: + mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT); + +err_close_rqt_indir: + mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT); + +err_close_drop_rq: + mlx5e_close_drop_rq(priv); + +err_close_tises: + mlx5e_close_tises(priv); + err_destroy_mkey: mlx5_core_destroy_mkey(mdev, &priv->mr); @@ -2060,6 +2141,12 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) struct net_device *netdev = priv->netdev; unregister_netdev(netdev); + mlx5e_close_flow_table(priv); + mlx5e_close_tirs(priv); + mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT); + mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT); + mlx5e_close_drop_rq(priv); + mlx5e_close_tises(priv); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index e6453f61141e..b4c87c7b0cf0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -387,6 +387,18 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_rqt_out)]; + + MLX5_SET(modify_rqt_in, in, rqtn, rqtn); + MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); +} + void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn) { u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h index d436c2d8b527..74cae51436e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h @@ -65,6 +65,8 @@ int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqtn); +int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in, + int inlen); void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn); #endif /* __TRANSOBJ_H__ */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 469b7bda3304..dd2097455a2e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4123,6 +4123,13 @@ struct mlx5_ifc_modify_rqt_out_bits { u8 reserved_1[0x40]; }; +struct mlx5_ifc_rqt_bitmask_bits { + u8 reserved[0x20]; + + u8 reserved1[0x1f]; + u8 rqn_list[0x1]; +}; + struct mlx5_ifc_modify_rqt_in_bits { u8 opcode[0x10]; u8 reserved_0[0x10]; @@ -4135,7 +4142,7 @@ struct mlx5_ifc_modify_rqt_in_bits { u8 reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_rqt_bitmask_bits bitmask; u8 reserved_4[0x40]; -- cgit v1.2.3 From efea389d3cc6427a9a94e92b2d7bf4c862f2cfcf Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 4 Aug 2015 14:05:47 +0300 Subject: net/mlx5_core: Support physical port counters Added physical port counters in the following standard formats to ethtool statistics: - IEEE 802.3 - RFC2863 - RFC2819 Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 75 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 10 ++- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 42 ++++++++++++ include/linux/mlx5/device.h | 10 +++ include/linux/mlx5/driver.h | 1 + 5 files changed, 137 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 35c33907a9ff..e9d7d90363a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -138,6 +138,80 @@ struct mlx5e_vport_stats { #define NUM_VPORT_COUNTERS 31 }; +static const char pport_strings[][ETH_GSTRING_LEN] = { + /* IEEE802.3 counters */ + "frames_tx", + "frames_rx", + "check_seq_err", + "alignment_err", + "octets_tx", + "octets_received", + "multicast_xmitted", + "broadcast_xmitted", + "multicast_rx", + "broadcast_rx", + "in_range_len_errors", + "out_of_range_len", + "too_long_errors", + "symbol_err", + "mac_control_tx", + "mac_control_rx", + "unsupported_op_rx", + "pause_ctrl_rx", + "pause_ctrl_tx", + + /* RFC2863 counters */ + "in_octets", + "in_ucast_pkts", + "in_discards", + "in_errors", + "in_unknown_protos", + "out_octets", + "out_ucast_pkts", + "out_discards", + "out_errors", + "in_multicast_pkts", + "in_broadcast_pkts", + "out_multicast_pkts", + "out_broadcast_pkts", + + /* RFC2819 counters */ + "drop_events", + "octets", + "pkts", + "broadcast_pkts", + "multicast_pkts", + "crc_align_errors", + "undersize_pkts", + "oversize_pkts", + "fragments", + "jabbers", + "collisions", + "p64octets", + "p65to127octets", + "p128to255octets", + "p256to511octets", + "p512to1023octets", + "p1024to1518octets", + "p1519to2047octets", + "p2048to4095octets", + "p4096to8191octets", + "p8192to10239octets", +}; + +#define NUM_IEEE_802_3_COUNTERS 19 +#define NUM_RFC_2863_COUNTERS 13 +#define NUM_RFC_2819_COUNTERS 21 +#define NUM_PPORT_COUNTERS (NUM_IEEE_802_3_COUNTERS + \ + NUM_RFC_2863_COUNTERS + \ + NUM_RFC_2819_COUNTERS) + +struct mlx5e_pport_stats { + __be64 IEEE_802_3_counters[NUM_IEEE_802_3_COUNTERS]; + __be64 RFC_2863_counters[NUM_RFC_2863_COUNTERS]; + __be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS]; +}; + static const char rq_stats_strings[][ETH_GSTRING_LEN] = { "packets", "csum_none", @@ -180,6 +254,7 @@ struct mlx5e_sq_stats { struct mlx5e_stats { struct mlx5e_vport_stats vport; + struct mlx5e_pport_stats pport; }; struct mlx5e_params { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index b95aa3384c36..b549797b315f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -171,7 +171,7 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset) switch (sset) { case ETH_SS_STATS: - return NUM_VPORT_COUNTERS + + return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS + priv->params.num_channels * NUM_RQ_STATS + priv->params.num_channels * priv->params.num_tc * NUM_SQ_STATS; @@ -200,6 +200,11 @@ static void mlx5e_get_strings(struct net_device *dev, strcpy(data + (idx++) * ETH_GSTRING_LEN, vport_strings[i]); + /* PPORT counters */ + for (i = 0; i < NUM_PPORT_COUNTERS; i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pport_strings[i]); + /* per channel counters */ for (i = 0; i < priv->params.num_channels; i++) for (j = 0; j < NUM_RQ_STATS; j++) @@ -234,6 +239,9 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev, for (i = 0; i < NUM_VPORT_COUNTERS; i++) data[idx++] = ((u64 *)&priv->stats.vport)[i]; + for (i = 0; i < NUM_PPORT_COUNTERS; i++) + data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]); + /* per channel counters */ for (i = 0; i < priv->params.num_channels; i++) for (j = 0; j < NUM_RQ_STATS; j++) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b8023a7484e0..111427b33ec8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -82,6 +82,47 @@ static void mlx5e_update_carrier_work(struct work_struct *work) mutex_unlock(&priv->state_lock); } +static void mlx5e_update_pport_counters(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_pport_stats *s = &priv->stats.pport; + u32 *in; + u32 *out; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + in = mlx5_vzalloc(sz); + out = mlx5_vzalloc(sz); + if (!in || !out) + goto free_out; + + MLX5_SET(ppcnt_reg, in, local_port, 1); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + memcpy(s->IEEE_802_3_counters, + MLX5_ADDR_OF(ppcnt_reg, out, counter_set), + sizeof(s->IEEE_802_3_counters)); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + memcpy(s->RFC_2863_counters, + MLX5_ADDR_OF(ppcnt_reg, out, counter_set), + sizeof(s->RFC_2863_counters)); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); + mlx5_core_access_reg(mdev, in, sz, out, + sz, MLX5_REG_PPCNT, 0, 0); + memcpy(s->RFC_2819_counters, + MLX5_ADDR_OF(ppcnt_reg, out, counter_set), + sizeof(s->RFC_2819_counters)); + +free_out: + kvfree(in); + kvfree(out); +} + void mlx5e_update_stats(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -202,6 +243,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv) s->tx_csum_offload = s->tx_packets - tx_offload_none; s->rx_csum_good = s->rx_packets - s->rx_csum_none; + mlx5e_update_pport_counters(priv); free_out: kvfree(out); } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b943cd9e2097..250b1ff8b48d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1182,6 +1182,16 @@ enum { MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40, }; +enum { + MLX5_IEEE_802_3_COUNTERS_GROUP = 0x0, + MLX5_RFC_2863_COUNTERS_GROUP = 0x1, + MLX5_RFC_2819_COUNTERS_GROUP = 0x2, + MLX5_RFC_3635_COUNTERS_GROUP = 0x3, + MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5, + MLX5_PER_PRIORITY_COUNTERS_GROUP = 0x10, + MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11 +}; + static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) { if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5fe0cae1a515..2039546b0ec6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -103,6 +103,7 @@ enum { MLX5_REG_PMTU = 0x5003, MLX5_REG_PTYS = 0x5004, MLX5_REG_PAOS = 0x5006, + MLX5_REG_PPCNT = 0x5008, MLX5_REG_PMAOS = 0x5012, MLX5_REG_PUDE = 0x5009, MLX5_REG_PMPE = 0x5010, -- cgit v1.2.3 From d92cff89a0c80e7e49796366e441d97f07b5d321 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 4 Aug 2015 18:26:19 +0200 Subject: net_dbg_ratelimited: turn into no-op when !DEBUG The pr_debug family of functions turns into a no-op when -DDEBUG is not specified, opting instead to call "no_printk", which gets compiled to a no-op (but retains gcc's nice warnings about printf-style arguments). The problem with net_dbg_ratelimited is that it is defined to be a variant of net_ratelimited_function, which expands to essentially: if (net_ratelimit()) pr_debug(fmt, ...); When DEBUG is not defined, then this becomes, if (net_ratelimit()) ; This seems benign, except it isn't. Firstly, there's the obvious overhead of calling net_ratelimit needlessly, which does quite some book keeping for the rate limiting. Given that the pr_debug and net_dbg_ratelimited family of functions are sprinkled liberally through performance critical code, with developers assuming they'll be compiled out to a no-op most of the time, we certainly do not want this needless book keeping. Secondly, and most visibly, even though no debug message is printed when DEBUG is not defined, if there is a flood of invocations, dmesg winds up peppered with messages such as "net_ratelimit: 320 callbacks suppressed". This is because our aforementioned net_ratelimit() function actually prints this text in some circumstances. It's especially odd to see this when there isn't any other accompanying debug message. So, in sum, it doesn't make sense to have this function's current behavior, and instead it should match what every other debug family of functions in the kernel does with !DEBUG -- nothing. This patch replaces calls to net_dbg_ratelimited when !DEBUG with no_printk, keeping with the idiom of all the other debug print helpers. Also, though not strictly neccessary, it guards the call with an if (0) so that all evaluation of any arguments are sure to be compiled out. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/linux/net.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 04aa06852771..049d4b03c4c4 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -239,8 +239,16 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) +#if defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) +#else +#define net_dbg_ratelimited(fmt, ...) \ + do { \ + if (0) \ + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ + } while (0) +#endif bool __net_get_random_once(void *buf, int nbytes, bool *done, struct static_key *done_key); -- cgit v1.2.3 From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001 From: Andreas Schultz Date: Wed, 5 Aug 2015 17:51:45 +0200 Subject: netfilter: nfacct: per network namespace support - Move the nfnl_acct_list into the network namespace, initialize and destroy it per namespace - Keep track of refcnt on nfacct objects, the old logic does not longer work with a per namespace list - Adjust xt_nfacct to pass the namespace when registring objects Signed-off-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_acct.h | 3 +- include/net/net_namespace.h | 3 ++ net/netfilter/nfnetlink_acct.c | 71 ++++++++++++++++++++++---------- net/netfilter/xt_nfacct.c | 2 +- 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index 6ec975748742..80ca889b164e 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -2,6 +2,7 @@ #define _NFNL_ACCT_H_ #include +#include enum { NFACCT_NO_QUOTA = -1, @@ -11,7 +12,7 @@ enum { struct nf_acct; -struct nf_acct *nfnl_acct_find_get(const char *filter_name); +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); extern int nfnl_acct_overquota(const struct sk_buff *skb, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e951453e0a23..2dcea635ecce 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -118,6 +118,9 @@ struct net { #endif struct sock *nfnl; struct sock *nfnl_stash; +#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) + struct list_head nfnl_acct_list; +#endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c18af2f63eef..fefbf5f0b28d 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -27,8 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); -static LIST_HEAD(nfnl_acct_list); - struct nf_acct { atomic64_t pkts; atomic64_t bytes; @@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; + struct net *net = sock_net(nfnl); char *acct_name; unsigned int size = 0; u32 flags = 0; @@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &nfnl_acct_list, head) { + list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } atomic_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); return 0; } @@ -185,6 +184,7 @@ nla_put_failure: static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -257,6 +257,7 @@ static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -336,19 +337,20 @@ static int nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); char *acct_name; struct nf_acct *cur; int ret = -ENOENT; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &nfnl_acct_list, head) + list_for_each_entry(cur, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = { MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); -struct nf_acct *nfnl_acct_find_get(const char *acct_name) +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { - atomic_dec(&acct->refcnt); + if (atomic_dec_and_test(&acct->refcnt)) + kfree_rcu(acct, rcu_head); + module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); @@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); +static int __net_init nfnl_acct_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->nfnl_acct_list); + + return 0; +} + +static void __net_exit nfnl_acct_net_exit(struct net *net) +{ + struct nf_acct *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_del_rcu(&cur->head); + + if (atomic_dec_and_test(&cur->refcnt)) + kfree_rcu(cur, rcu_head); + } +} + +static struct pernet_operations nfnl_acct_ops = { + .init = nfnl_acct_net_init, + .exit = nfnl_acct_net_exit, +}; + static int __init nfnl_acct_init(void) { int ret; + ret = register_pernet_subsys(&nfnl_acct_ops); + if (ret < 0) { + pr_err("nfnl_acct_init: failed to register pernet ops\n"); + goto err_out; + } + pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); - goto err_out; + goto cleanup_pernet; } return 0; + +cleanup_pernet: + unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { - struct nf_acct *cur, *tmp; - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); - - list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { - list_del_rcu(&cur->head); - /* We are sure that our objects have no clients at this point, - * it's safe to release them all without checking refcnt. */ - kfree_rcu(cur, rcu_head); - } + unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 8c646ed9c921..3048a7e3a90a 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) struct xt_nfacct_match_info *info = par->matchinfo; struct nf_acct *nfacct; - nfacct = nfnl_acct_find_get(info->name); + nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { pr_info("xt_nfacct: accounting object with name `%s' " "does not exists\n", info->name); -- cgit v1.2.3 From ffe8690c85b8426db7783064724d106702f1b1e8 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:32 +0000 Subject: perf: add the necessary core perf APIs when accessing events counters in eBPF programs This patch add three core perf APIs: - perf_event_attrs(): export the struct perf_event_attr from struct perf_event; - perf_event_get(): get the struct perf_event from the given fd; - perf_event_read_local(): read the events counters active on the current CPU; These APIs are needed when accessing events counters in eBPF programs. The API perf_event_read_local() comes from Peter and I add the corresponding SOB. Signed-off-by: Kaixu Xia Signed-off-by: Peter Zijlstra Signed-off-by: David S. Miller --- include/linux/perf_event.h | 10 ++++++ kernel/events/core.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2027809433b3..092a0e8a479a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); +extern struct perf_event *perf_event_get(unsigned int fd); +extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); @@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); +extern u64 perf_event_read_local(struct perf_event *event); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } +static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + return ERR_PTR(-EINVAL); +} +static inline u64 perf_event_read_local(struct perf_event *event) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } @@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } +static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) diff --git a/kernel/events/core.c b/kernel/events/core.c index d3dae3419b99..e2c6a8886d4d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event) return __perf_event_count(event); } +/* + * NMI-safe method to read a local event, that is an event that + * is: + * - either for the current task, or for this CPU + * - does not have inherit set, for inherited task events + * will not be local and we cannot read them atomically + * - must not have a pmu::count method + */ +u64 perf_event_read_local(struct perf_event *event) +{ + unsigned long flags; + u64 val; + + /* + * Disabling interrupts avoids all counter scheduling (context + * switches, timer based rotation and IPIs). + */ + local_irq_save(flags); + + /* If this is a per-task event, it must be for current */ + WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current); + + /* If this is a per-CPU event, it must be for this CPU */ + WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()); + + /* + * It must not be an event with inherit set, we cannot read + * all child counters from atomic context. + */ + WARN_ON_ONCE(event->attr.inherit); + + /* + * It must not have a pmu::count method, those are not + * NMI safe. + */ + WARN_ON_ONCE(event->pmu->count); + + /* + * If the event is currently on this CPU, its either a per-task event, + * or local to this CPU. Furthermore it means its ACTIVE (otherwise + * oncpu == -1). + */ + if (event->oncpu == smp_processor_id()) + event->pmu->read(event); + + val = local64_read(&event->count); + local_irq_restore(flags); + + return val; +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -8574,6 +8627,31 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } +struct perf_event *perf_event_get(unsigned int fd) +{ + int err; + struct fd f; + struct perf_event *event; + + err = perf_fget_light(fd, &f); + if (err) + return ERR_PTR(err); + + event = f.file->private_data; + atomic_long_inc(&event->refcount); + fdput(f); + + return event; +} + +const struct perf_event_attr *perf_event_attrs(struct perf_event *event) +{ + if (!event) + return ERR_PTR(-EINVAL); + + return &event->attr; +} + /* * inherit a event from parent task to child task: */ -- cgit v1.2.3 From 2a36f0b92eb638dd023870574eb471b1c56be9ad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 6 Aug 2015 07:02:33 +0000 Subject: bpf: Make the bpf_prog_array_map more generic All the map backends are of generic nature. In order to avoid adding much special code into the eBPF core, rewrite part of the bpf_prog_array map code and make it more generic. So the new perf_event_array map type can reuse most of code with bpf_prog_array map and add fewer lines of special code. Signed-off-by: Wang Nan Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- arch/x86/net/bpf_jit_comp.c | 6 ++-- include/linux/bpf.h | 8 +++-- kernel/bpf/arraymap.c | 80 +++++++++++++++++++++++++++------------------ kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 +- 5 files changed, 60 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index ec5214f39aa8..70efcd0940f9 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog) * goto out; * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; - * prog = array->prog[index]; + * prog = array->ptrs[index]; * if (prog == NULL) * goto out; * goto *(prog->bpf_func + prologue_size); @@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ - /* prog = array->prog[index]; */ + /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ - offsetof(struct bpf_array, prog)); + offsetof(struct bpf_array, ptrs)); EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 139d6d2e123f..d495211d63d1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -24,6 +24,10 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + + /* funcs called by prog_array and perf_event_array map */ + void *(*map_fd_get_ptr) (struct bpf_map *map, int fd); + void (*map_fd_put_ptr) (void *ptr); }; struct bpf_map { @@ -142,13 +146,13 @@ struct bpf_array { bool owner_jited; union { char value[0] __aligned(8); - struct bpf_prog *prog[0] __aligned(8); + void *ptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); -void bpf_prog_array_map_clear(struct bpf_map *map); +void bpf_fd_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cb31229a6fa4..45df6572ecfd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -150,15 +150,15 @@ static int __init register_array_map(void) } late_initcall(register_array_map); -static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { - /* only bpf_prog file descriptors can be stored in prog_array map */ + /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); return array_map_alloc(attr); } -static void prog_array_map_free(struct bpf_map *map) +static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map) /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) - BUG_ON(array->prog[i] != NULL); + BUG_ON(array->ptrs[i] != NULL); kvfree(array); } -static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* only called from syscall */ -static int prog_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int fd_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *prog, *old_prog; + void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) @@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - prog = bpf_prog_get(ufd); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (!bpf_prog_array_compatible(array, prog)) { - bpf_prog_put(prog); - return -EINVAL; - } + new_ptr = map->ops->map_fd_get_ptr(map, ufd); + if (IS_ERR(new_ptr)) + return PTR_ERR(new_ptr); - old_prog = xchg(array->prog + index, prog); - if (old_prog) - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, new_ptr); + if (old_ptr) + map->ops->map_fd_put_ptr(old_ptr); return 0; } -static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +static int fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct bpf_prog *old_prog; + void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; - old_prog = xchg(array->prog + index, NULL); - if (old_prog) { - bpf_prog_put_rcu(old_prog); + old_ptr = xchg(array->ptrs + index, NULL); + if (old_ptr) { + map->ops->map_fd_put_ptr(old_ptr); return 0; } else { return -ENOENT; } } +static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) + return prog; + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return ERR_PTR(-EINVAL); + } + return prog; +} + +static void prog_fd_array_put_ptr(void *ptr) +{ + struct bpf_prog *prog = ptr; + + bpf_prog_put_rcu(prog); +} + /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_prog_array_map_clear(struct bpf_map *map) +void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - prog_array_map_delete_elem(map, &i); + fd_array_map_delete_elem(map, &i); } static const struct bpf_map_ops prog_array_ops = { - .map_alloc = prog_array_map_alloc, - .map_free = prog_array_map_free, + .map_alloc = fd_array_map_alloc, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, - .map_lookup_elem = prog_array_map_lookup_elem, - .map_update_elem = prog_array_map_update_elem, - .map_delete_elem = prog_array_map_delete_elem, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = prog_fd_array_get_ptr, + .map_fd_put_ptr = prog_fd_array_put_ptr, }; static struct bpf_map_type_list prog_array_type __read_mostly = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fafa74161445..67c380cfa9ca 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -450,7 +450,7 @@ select_insn: tail_call_cnt++; - prog = READ_ONCE(array->prog[index]); + prog = READ_ONCE(array->ptrs[index]); if (unlikely(!prog)) goto out; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a1b14d197a4f..dc9b464fefa9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp) /* prog_array stores refcnt-ed bpf_prog pointers * release them all when user space closes prog_array_fd */ - bpf_prog_array_map_clear(map); + bpf_fd_array_map_clear(map); bpf_map_put(map); return 0; -- cgit v1.2.3 From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:34 +0000 Subject: bpf: Add new bpf map type to store the pointer to struct perf_event Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map only stores the pointer to struct perf_event. The user space event FDs from perf_event_open() syscall are converted to the pointer to struct perf_event and stored in map. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d495211d63d1..4fc1f4070789 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bpf_map; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ce13c109b00..a1814e8e53a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -114,6 +114,7 @@ enum bpf_map_type { BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 45df6572ecfd..29ace107f236 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -273,3 +273,60 @@ static int __init register_prog_array_map(void) return 0; } late_initcall(register_prog_array_map); + +static void perf_event_array_map_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +{ + struct perf_event *event; + const struct perf_event_attr *attr; + + event = perf_event_get(fd); + if (IS_ERR(event)) + return event; + + attr = perf_event_attrs(event); + if (IS_ERR(attr)) + return (void *)attr; + + if (attr->type != PERF_TYPE_RAW && + attr->type != PERF_TYPE_HARDWARE) { + perf_event_release_kernel(event); + return ERR_PTR(-EINVAL); + } + return event; +} + +static void perf_event_fd_array_put_ptr(void *ptr) +{ + struct perf_event *event = ptr; + + perf_event_release_kernel(event); +} + +static const struct bpf_map_ops perf_event_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = perf_event_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_update_elem = fd_array_map_update_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = perf_event_fd_array_get_ptr, + .map_fd_put_ptr = perf_event_fd_array_put_ptr, +}; + +static struct bpf_map_type_list perf_event_array_type __read_mostly = { + .ops = &perf_event_array_ops, + .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, +}; + +static int __init register_perf_event_array_map(void) +{ + bpf_register_map_type(&perf_event_array_type); + return 0; +} +late_initcall(register_perf_event_array_map); -- cgit v1.2.3 From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 6 Aug 2015 07:02:35 +0000 Subject: bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter According to the perf_event_map_fd and index, the function bpf_perf_event_read() can convert the corresponding map value to the pointer to struct perf_event and return the Hardware PMU counter value. Signed-off-by: Kaixu Xia Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 48 +++++++++++++++++++++++++++++++++--------------- kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fc1f4070789..f57d7fed9ec3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_perf_event_read_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1814e8e53a7..92a48e2d5461 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -271,6 +271,7 @@ enum bpf_func_id { */ BPF_FUNC_skb_get_tunnel_key, BPF_FUNC_skb_set_tunnel_key, + BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd307df98cb3..48e1c7192560 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,14 @@ static const char * const reg_type_str[] = { [CONST_IMM] = "imm", }; +static const struct { + int map_type; + int func_id; +} func_limit[] = { + {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, + {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, +}; + static void print_verifier_state(struct verifier_env *env) { enum bpf_reg_type t; @@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return err; } +static int check_map_func_compatibility(struct bpf_map *map, int func_id) +{ + bool bool_map, bool_func; + int i; + + if (!map) + return 0; + + for (i = 0; i <= ARRAY_SIZE(func_limit); i++) { + bool_map = (map->map_type == func_limit[i].map_type); + bool_func = (func_id == func_limit[i].func_id); + /* only when map & func pair match it can continue. + * don't allow any other map type to be passed into + * the special func; + */ + if (bool_map != bool_func) + return -EINVAL; + } + + return 0; +} + static int check_call(struct verifier_env *env, int func_id) { struct verifier_state *state = &env->cur_state; @@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } - if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && - func_id != BPF_FUNC_tail_call) - /* prog_array map type needs extra care: - * only allow to pass it into bpf_tail_call() for now. - * bpf_map_delete_elem() can be allowed in the future, - * while bpf_map_update_elem() must only be done via syscall - */ - return -EINVAL; - - if (func_id == BPF_FUNC_tail_call && - map->map_type != BPF_MAP_TYPE_PROG_ARRAY) - /* don't allow any other map type to be passed into - * bpf_tail_call() - */ - return -EINVAL; + err = check_map_func_compatibility(map, func_id); + if (err) + return err; return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 88a041adee90..ef9936df1b04 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_event *event; + + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + event = (struct perf_event *)array->ptrs[index]; + if (!event) + return -ENOENT; + + /* + * we don't know if the function is run successfully by the + * return value. It can be judged in other places, such as + * eBPF programs. + */ + return perf_event_read_local(event); +} + +const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; default: return NULL; } -- cgit v1.2.3 From 4c96b7dc0d393f12c17e0d81db15aa4a820a6ab3 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 12 Aug 2015 17:06:26 -0500 Subject: Add a matching set of device_ functions for determining mac/phy OF has some helper functions for parsing MAC and PHY settings. In cases where the platform is providing this information rather than the device itself, there needs to be similar functions for ACPI. These functions are slightly modified versions of the ones in of_net which can use information provided via DT or ACPI. Signed-off-by: Jeremy Linton Signed-off-by: David S. Miller --- drivers/base/property.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/property.h | 4 +++ 2 files changed, 77 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index f3f6d167f3f1..2e8cd147f02d 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include /** * device_add_property_set - Add a collection of properties to a device object. @@ -533,3 +535,74 @@ bool device_dma_is_coherent(struct device *dev) return coherent; } EXPORT_SYMBOL_GPL(device_dma_is_coherent); + +/** + * device_get_phy_mode - Get phy mode for given device_node + * @dev: Pointer to the given device + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type', and return its index in phy_modes table, or errno in + * error case. + */ +int device_get_phy_mode(struct device *dev) +{ + const char *pm; + int err, i; + + err = device_property_read_string(dev, "phy-mode", &pm); + if (err < 0) + err = device_property_read_string(dev, + "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) + return i; + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(device_get_phy_mode); + +static void *device_get_mac_addr(struct device *dev, + const char *name, char *addr, + int alen) +{ + int ret = device_property_read_u8_array(dev, name, addr, alen); + + if (ret == 0 && is_valid_ether_addr(addr)) + return addr; + return NULL; +} + +/** + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. +*/ +void *device_get_mac_address(struct device *dev, char *addr, int alen) +{ + addr = device_get_mac_addr(dev, "mac-address", addr, alen); + if (addr) + return addr; + + addr = device_get_mac_addr(dev, "local-mac-address", addr, alen); + if (addr) + return addr; + + return device_get_mac_addr(dev, "address", addr, alen); +} +EXPORT_SYMBOL(device_get_mac_address); diff --git a/include/linux/property.h b/include/linux/property.h index 76ebde9c11d4..a59c6ee566c2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -166,4 +166,8 @@ void device_add_property_set(struct device *dev, struct property_set *pset); bool device_dma_is_coherent(struct device *dev); +int device_get_phy_mode(struct device *dev); + +void *device_get_mac_address(struct device *dev, char *addr, int alen); + #endif /* _LINUX_PROPERTY_H_ */ -- cgit v1.2.3 From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:01 -0400 Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down Like the ipv4 patch with a similar title, this adds a sysctl to allow the user to change routing behavior based on whether or not the interface associated with the nexthop was an up or down link. The default setting preserves the current behavior, but anyone that enables it will notice that nexthops on down interfaces will no longer be selected: net.ipv6.conf.all.ignore_routes_with_linkdown = 0 net.ipv6.conf.default.ignore_routes_with_linkdown = 0 net.ipv6.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, not only will link status be reported to userspace, but an indication that a nexthop is dead and will not be used is also reported. 1000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium 1000::/8 via 8000::2 dev p8p1 metric 1024 pref medium 7000::/8 dev p7p1 proto kernel metric 256 dead linkdown pref medium 8000::/8 dev p8p1 proto kernel metric 256 pref medium 9000::/8 via 8000::2 dev p8p1 metric 2048 pref medium 9000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium fe80::/64 dev p7p1 proto kernel metric 256 dead linkdown pref medium fe80::/64 dev p8p1 proto kernel metric 256 pref medium This also adds devconf support and notification when sysctl values change. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/route.c | 11 ++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb9dcad72372..f1f32af6d9b9 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -31,6 +31,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; + __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 80f3b74446a1..38b4fef20219 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -173,6 +173,7 @@ enum { DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, + DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 53e3a9d756b0..5dfbac72f1ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; /* Check if a valid qdisc is available */ @@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); + return size; } @@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + devconf->ignore_routes_with_linkdown) < 0) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) rt6_purge_dflt_routers(net); return 1; } + +static void addrconf_linkdown_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); + + idev->cnf.ignore_routes_with_linkdown = newf; + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + dev->ifindex, + &idev->cnf); + } + } +} + +static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { + net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + addrconf_linkdown_change(net, newf); + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + rtnl_unlock(); + + return 1; +} + #endif /* Nobody refers to this ifaddr, destroy it */ @@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } @@ -5338,6 +5407,34 @@ out: return err; } +static +int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* ctl->data points to idev->cnf.ignore_routes_when_linkdown + * we should not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_linkdown(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { /* sentinel */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 370f72785385..1c0217e61357 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; @@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_carrier_ok(rt->dst.dev)) + if (!netif_carrier_ok(rt->dst.dev)) { rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3 From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:00 -0600 Subject: net: Introduce VRF related flags and helpers Add a VRF_MASTER flag for interfaces and helper functions for determining if a device is a VRF_MASTER. Add link attribute for passing VRF_TABLE id. Add vrf_ptr to netdevice. Add various macros for determining if a device is a VRF device, the index of the master VRF device and table associated with VRF device. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++ include/net/vrf.h | 139 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if_link.h | 9 +++ 3 files changed, 168 insertions(+) create mode 100644 include/net/vrf.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 607b5f41f46f..f7a6ef2fae3a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1289,6 +1289,7 @@ enum netdev_priv_flags { IFF_XMIT_DST_RELEASE_PERM = 1<<22, IFF_IPVLAN_MASTER = 1<<23, IFF_IPVLAN_SLAVE = 1<<24, + IFF_VRF_MASTER = 1<<25, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1316,6 +1317,7 @@ enum netdev_priv_flags { #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM #define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE +#define IFF_VRF_MASTER IFF_VRF_MASTER /** * struct net_device - The DEVICE structure. @@ -1432,6 +1434,7 @@ enum netdev_priv_flags { * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data + * @vrf_ptr: VRF specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * * @last_rx: Time of last Rx @@ -1650,6 +1653,7 @@ struct net_device { struct dn_dev __rcu *dn_ptr; struct inet6_dev __rcu *ip6_ptr; void *ax25_ptr; + struct net_vrf_dev __rcu *vrf_ptr; struct wireless_dev *ieee80211_ptr; struct wpan_dev *ieee802154_ptr; #if IS_ENABLED(CONFIG_MPLS_ROUTING) @@ -3808,6 +3812,22 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev->priv_flags & IFF_SUPP_NOFCS; } +static inline bool netif_is_vrf(const struct net_device *dev) +{ + return dev->priv_flags & IFF_VRF_MASTER; +} + +static inline bool netif_index_is_vrf(struct net *net, int ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, ifindex); + bool rc = false; + + if (dev) + rc = netif_is_vrf(dev); + + return rc; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { diff --git a/include/net/vrf.h b/include/net/vrf.h new file mode 100644 index 000000000000..0484d29d4589 --- /dev/null +++ b/include/net/vrf.h @@ -0,0 +1,139 @@ +/* + * include/net/net_vrf.h - adds vrf dev structure definitions + * Copyright (c) 2015 Cumulus Networks + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_NET_VRF_H +#define __LINUX_NET_VRF_H + +struct net_vrf_dev { + struct rcu_head rcu; + int ifindex; /* ifindex of master dev */ + u32 tb_id; /* table id for VRF */ +}; + +struct slave { + struct list_head list; + struct net_device *dev; +}; + +struct slave_queue { + struct list_head all_slaves; + int num_slaves; +}; + +struct net_vrf { + struct slave_queue queue; + struct rtable *rth; + u32 tb_id; +}; + + +#if IS_ENABLED(CONFIG_NET_VRF) +/* called with rcu_read_lock() */ +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + struct net_vrf_dev *vrf_ptr; + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_vrf(dev)) + ifindex = dev->ifindex; + else { + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + ifindex = vrf_ptr->ifindex; + } + + return ifindex; +} + +/* called with rcu_read_lock */ +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rcu_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + int tb_id; + + rcu_read_lock(); + tb_id = vrf_dev_table_rcu(dev); + rcu_read_unlock(); + + return tb_id; +} + +/* called with rtnl */ +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr; + + vrf_ptr = rtnl_dereference(dev->vrf_ptr); + if (vrf_ptr) + tb_id = vrf_ptr->tb_id; + } + return tb_id; +} + +/* caller has already checked netif_is_vrf(dev) */ +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + struct rtable *rth = ERR_PTR(-ENETUNREACH); + struct net_vrf *vrf = netdev_priv(dev); + + if (vrf) { + rth = vrf->rth; + atomic_inc(&rth->dst.__refcnt); + } + return rth; +} + +#else +static inline int vrf_master_ifindex_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rcu(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_dev_table_rtnl(const struct net_device *dev) +{ + return 0; +} + +static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev) +{ + return ERR_PTR(-ENETUNREACH); +} +#endif + +#endif /* __LINUX_NET_VRF_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d450be36add2..313c305fd1ad 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -341,6 +341,15 @@ enum macvlan_macaddr_mode { #define MACVLAN_FLAG_NOPROMISC 1 +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, -- cgit v1.2.3 From 2377799c084d86d22074cd4acd20edc32024d669 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Jul 2015 12:17:25 +0200 Subject: average: provide macro to create static EWMA Having the EWMA parameters stored in the runtime struct imposes memory requirements for the constant values that could just be inlined in the code. This particularly makes sense if there are a lot of such structs, for example in mac80211 in the station table where each station has a number of these in an array, and there can be many stations. Provide a macro DECLARE_EWMA() that declares the necessary struct and inline functions to access it with the parameters hard-coded; using this also means the user no longer needs to 'select AVERAGE' as it's entirely self-contained. In the mac80211 case, on x86-64, this actually slightly *reduces* code size, while also saving 80 bytes of runtime memory per sta. Signed-off-by: Johannes Berg --- include/linux/average.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/average.h b/include/linux/average.h index c6028fd742c1..60f8226c5652 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -27,4 +27,43 @@ static inline unsigned long ewma_read(const struct ewma *avg) return avg->internal >> avg->factor; } +#define DECLARE_EWMA(name, _factor, _weight) \ + struct ewma_##name { \ + unsigned long internal; \ + }; \ + static inline void ewma_##name##_init(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + e->internal = 0; \ + } \ + static inline unsigned long \ + ewma_##name##_read(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + return e->internal >> ilog2(_factor); \ + } \ + static inline void ewma_##name##_add(struct ewma_##name *e, \ + unsigned long val) \ + { \ + unsigned long internal = ACCESS_ONCE(e->internal); \ + unsigned long weight = ilog2(_weight); \ + unsigned long factor = ilog2(_factor); \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_factor)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight)); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_factor); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight); \ + \ + ACCESS_ONCE(e->internal) = internal ? \ + (((internal << weight) - internal) + \ + (val << factor)) >> weight : \ + (val << factor); \ + } + #endif /* _LINUX_AVERAGE_H */ -- cgit v1.2.3 From 8f9c98df949333f08b74e5df1caacf7e2c5e8552 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 19 Jul 2015 16:09:12 +0300 Subject: mac80211: fix BIT position for TDLS WIDE extended cap The bit was not according to ieee80211 specification. Fix that. Reviewed-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b9c7897dc566..cfa906f28b7a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2074,8 +2074,8 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA5_TDLS_PROHIBITED BIT(6) #define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED BIT(7) +#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(5) #define WLAN_EXT_CAPA8_OPMODE_NOTIF BIT(6) -#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(7) /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 -- cgit v1.2.3 From 76b733d15874128ee2d0365b4cbe7d51decd8d37 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Fri, 14 Aug 2015 22:33:30 +0200 Subject: nfc: st-nci: Remove duplicate file platform_data/st_nci.h commit "nfc: st-nci: Rename st21nfcb to st-nci" adds include/linux/platform_data/st_nci.h duplicated with include/linux/platform_data/st-nci.h. Only drivers/nfc/st-nci/i2c.c uses platform_data/st_nci.h. Cc: stable@vger.kernel.org Reported-by: Hauke Mehrtens Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/st-nci/i2c.c | 2 +- include/linux/platform_data/st_nci.h | 29 ----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 include/linux/platform_data/st_nci.h (limited to 'include/linux') diff --git a/drivers/nfc/st-nci/i2c.c b/drivers/nfc/st-nci/i2c.c index 06175ce769bb..b2a467c2d668 100644 --- a/drivers/nfc/st-nci/i2c.c +++ b/drivers/nfc/st-nci/i2c.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include "ndlc.h" diff --git a/include/linux/platform_data/st_nci.h b/include/linux/platform_data/st_nci.h deleted file mode 100644 index d9d400a297bd..000000000000 --- a/include/linux/platform_data/st_nci.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Driver include for ST NCI NFC chip family. - * - * Copyright (C) 2014-2015 STMicroelectronics SAS. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _ST_NCI_H_ -#define _ST_NCI_H_ - -#define ST_NCI_DRIVER_NAME "st_nci" - -struct st_nci_nfc_platform_data { - unsigned int gpio_reset; - unsigned int irq_polarity; -}; - -#endif /* _ST_NCI_H_ */ -- cgit v1.2.3 From fa8187c96471c49419c25d4ec3299d17d3f274b2 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 13 Aug 2015 19:01:06 +0200 Subject: net: declare new net_device priv_flag IFF_NO_QUEUE This private net_device flag can be set by drivers to inform that a device runs fine without a qdisc attached. This was formerly done by setting tx_queue_len to zero. Signed-off-by: Phil Sutter Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f7a6ef2fae3a..d131755516ca 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1262,6 +1262,7 @@ struct net_device_ops { * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device + * @IFF_NO_QUEUE: device can run without qdisc attached */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1290,6 +1291,7 @@ enum netdev_priv_flags { IFF_IPVLAN_MASTER = 1<<23, IFF_IPVLAN_SLAVE = 1<<24, IFF_VRF_MASTER = 1<<25, + IFF_NO_QUEUE = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1318,6 +1320,7 @@ enum netdev_priv_flags { #define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE #define IFF_VRF_MASTER IFF_VRF_MASTER +#define IFF_NO_QUEUE IFF_NO_QUEUE /** * struct net_device - The DEVICE structure. -- cgit v1.2.3 From fbaff3ef859a86dc3df2128d2de9f8a6e255a967 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Thu, 13 Aug 2015 18:34:03 -0700 Subject: net: fix endian check warning in etherdevice.h Sparse builds have been warning for a really long time now that etherdevice.h has a conversion that is unsafe. include/linux/etherdevice.h:79:32: warning: restricted __be16 degrades to integer This code change fixes the issue and generates the exact same assembly before/after (checked on x86_64) Fixes: 2c722fe1c821 (etherdevice: Optimize a few is__ether_addr functions) Signed-off-by: Jesse Brandeburg CC: Joe Perches Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 9012f8775208..eb049c622208 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -76,7 +76,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr) #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | - ((a[2] ^ b[2]) & m)) == 0; + (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif -- cgit v1.2.3 From 6fa1bcab6be6e9bd93f80e345c7e9a4ec7861df9 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Sun, 16 Aug 2015 16:04:50 +0300 Subject: net/mlx5e: Ethtool link speed setting fixes - Port speed settings are applied by the device only upon port admin status transition from DOWN to UP. So we enforce this transition regardless of the port's current operation state (which may be occasionally DOWN if for example the network cable is disconnected). - Fix the PORT_UP/DOWN device interface enum - Set the local_port bit in the device PAOS register - EXPORT the PAOS (Port Administrative and Operational Status) register set/query access functions. Signed-off-by: Achiad Shochat Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 25 ++++++---------------- drivers/net/ethernet/mellanox/mlx5/core/port.c | 14 ++++++++---- include/linux/mlx5/driver.h | 11 +++++----- 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 975828521913..77158b31eb0b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -628,7 +628,7 @@ static int mlx5e_set_settings(struct net_device *netdev, u32 link_modes; u32 speed; u32 eth_proto_cap, eth_proto_admin; - u8 port_status; + enum mlx5_port_status ps; int err; speed = ethtool_cmd_speed(cmd); @@ -662,24 +662,13 @@ static int mlx5e_set_settings(struct net_device *netdev, if (link_modes == eth_proto_admin) goto out; - err = mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN); - if (err) { - netdev_err(netdev, "%s: set port eth proto admin failed: %d\n", - __func__, err); - goto out; - } + mlx5_query_port_admin_status(mdev, &ps); + if (ps == MLX5_PORT_UP) + mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN); + mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN); + if (ps == MLX5_PORT_UP) + mlx5_set_port_admin_status(mdev, MLX5_PORT_UP); - err = mlx5_query_port_status(mdev, &port_status); - if (err) - goto out; - - if (port_status == MLX5_PORT_DOWN) - return 0; - - err = mlx5_set_port_status(mdev, MLX5_PORT_DOWN); - if (err) - goto out; - err = mlx5_set_port_status(mdev, MLX5_PORT_UP); out: return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 70147999f657..f8db52bd8f18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -216,22 +216,25 @@ int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, } EXPORT_SYMBOL_GPL(mlx5_set_port_proto); -int mlx5_set_port_status(struct mlx5_core_dev *dev, - enum mlx5_port_status status) +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status) { u32 in[MLX5_ST_SZ_DW(paos_reg)]; u32 out[MLX5_ST_SZ_DW(paos_reg)]; memset(in, 0, sizeof(in)); + MLX5_SET(paos_reg, in, local_port, 1); MLX5_SET(paos_reg, in, admin_status, status); MLX5_SET(paos_reg, in, ase, 1); return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PAOS, 0, 1); } +EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status); -int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status) +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status *status) { u32 in[MLX5_ST_SZ_DW(paos_reg)]; u32 out[MLX5_ST_SZ_DW(paos_reg)]; @@ -239,14 +242,17 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status) memset(in, 0, sizeof(in)); + MLX5_SET(paos_reg, in, local_port, 1); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_PAOS, 0, 0); if (err) return err; - *status = MLX5_GET(paos_reg, out, oper_status); + *status = MLX5_GET(paos_reg, out, admin_status); return err; } +EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status); static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu, int *max_mtu, int *oper_mtu, u8 port) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2039546b0ec6..4b5d7fc88d0f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -152,8 +152,8 @@ enum mlx5_dev_event { }; enum mlx5_port_status { - MLX5_PORT_UP = 1 << 1, - MLX5_PORT_DOWN = 1 << 2, + MLX5_PORT_UP = 1, + MLX5_PORT_DOWN = 2, }; struct mlx5_uuar_info { @@ -761,9 +761,10 @@ int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, u8 local_port); int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, int proto_mask); -int mlx5_set_port_status(struct mlx5_core_dev *dev, - enum mlx5_port_status status); -int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status); +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status); +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status *status); int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); -- cgit v1.2.3 From 3c2d18ef22df1bdccfb11a5b85b29e4e61b9d9c6 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Sun, 16 Aug 2015 16:04:51 +0300 Subject: net/mlx5e: Support ethtool get/set_pauseparam Only rx/tx pause settings. Autoneg setting is currently not supported. Signed-off-by: Achiad Shochat Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 38 ++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/port.c | 42 ++++++++++++++++++++++ include/linux/mlx5/driver.h | 5 +++ 3 files changed, 85 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 77158b31eb0b..bce912688ca8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -820,6 +820,42 @@ static int mlx5e_set_tunable(struct net_device *dev, return err; } +static void mlx5e_get_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause, + &pauseparam->tx_pause); + if (err) { + netdev_err(netdev, "%s: mlx5_query_port_pause failed:0x%x\n", + __func__, err); + } +} + +static int mlx5e_set_pauseparam(struct net_device *netdev, + struct ethtool_pauseparam *pauseparam) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + int err; + + if (pauseparam->autoneg) + return -EINVAL; + + err = mlx5_set_port_pause(mdev, + pauseparam->rx_pause ? 1 : 0, + pauseparam->tx_pause ? 1 : 0); + if (err) { + netdev_err(netdev, "%s: mlx5_set_port_pause failed:0x%x\n", + __func__, err); + } + + return err; +} + const struct ethtool_ops mlx5e_ethtool_ops = { .get_drvinfo = mlx5e_get_drvinfo, .get_link = ethtool_op_get_link, @@ -841,4 +877,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = { .get_rxnfc = mlx5e_get_rxnfc, .get_tunable = mlx5e_get_tunable, .set_tunable = mlx5e_set_tunable, + .get_pauseparam = mlx5e_get_pauseparam, + .set_pauseparam = mlx5e_set_pauseparam, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index f8db52bd8f18..821caaab9bfb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -334,3 +334,45 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, return 0; } EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap); + +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + memset(in, 0, sizeof(in)); + MLX5_SET(pfcc_reg, in, local_port, 1); + MLX5_SET(pfcc_reg, in, pptx, tx_pause); + MLX5_SET(pfcc_reg, in, pprx, rx_pause); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 1); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_set_port_pause); + +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, u32 *tx_pause) +{ + u32 in[MLX5_ST_SZ_DW(pfcc_reg)]; + u32 out[MLX5_ST_SZ_DW(pfcc_reg)]; + int err; + + memset(in, 0, sizeof(in)); + MLX5_SET(pfcc_reg, in, local_port, 1); + + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_PFCC, 0, 0); + if (err) + return err; + + if (rx_pause) + *rx_pause = MLX5_GET(pfcc_reg, out, pprx); + + if (tx_pause) + *tx_pause = MLX5_GET(pfcc_reg, out, pptx); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_port_pause); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 4b5d7fc88d0f..8b6d6f2154a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -103,6 +103,7 @@ enum { MLX5_REG_PMTU = 0x5003, MLX5_REG_PTYS = 0x5004, MLX5_REG_PAOS = 0x5006, + MLX5_REG_PFCC = 0x5007, MLX5_REG_PPCNT = 0x5008, MLX5_REG_PMAOS = 0x5012, MLX5_REG_PUDE = 0x5009, @@ -774,6 +775,10 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, u8 *vl_hw_cap, u8 local_port); +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, u32 *tx_pause); + int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, -- cgit v1.2.3 From 2f52bdcf6ba1bd81597b1505a222430676548b3a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Aug 2015 07:49:20 -0600 Subject: net: Updates to netif_index_is_vrf As Eric noted netif_index_is_vrf is not called with rcu_read_lock held, so wrap the dev_get_by_index_rcu in rcu_read_lock and unlock. If VRF is not enabled or oif is 0 skip the device lookup. In both cases index cannot be the VRF master. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d131755516ca..8a530f930754 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3822,12 +3822,22 @@ static inline bool netif_is_vrf(const struct net_device *dev) static inline bool netif_index_is_vrf(struct net *net, int ifindex) { - struct net_device *dev = dev_get_by_index_rcu(net, ifindex); bool rc = false; +#if IS_ENABLED(CONFIG_NET_VRF) + struct net_device *dev; + + if (ifindex == 0) + return false; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); if (dev) rc = netif_is_vrf(dev); + rcu_read_unlock(); +#endif return rc; } -- cgit v1.2.3 From 808d28c468a89e8680396d98aea96024b6f5afdc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Aug 2015 10:26:49 -0600 Subject: net: Fix docbook warning for IFF_VRF_MASTER enum kbuild test robot reported: tree: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master head: d52736e24fe2e927c26817256f8d1a3c8b5d51a0 commit: 4e3c89920cd3a6cfce22c6f537690747c26128dd [751/762] net: Introduce VRF related flags and helpers reproduce: make htmldocs >> Warning(include/linux/netdevice.h:1293): Enum value 'IFF_VRF_MASTER' not described in enum 'netdev_priv_flags' Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8a530f930754..4bd177faa90c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1262,6 +1262,7 @@ struct net_device_ops { * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device + * @IFF_VRF_MASTER: device is a VRF master * @IFF_NO_QUEUE: device can run without qdisc attached */ enum netdev_priv_flags { -- cgit v1.2.3 From 74f4e0cc61080f63f28e8d519bdf437957e64217 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 14 Aug 2015 00:21:45 +0200 Subject: bcma: switch GPIO portions to use GPIOLIB_IRQCHIP This switches the BCMA GPIO driver to use GPIOLIB_IRQCHIP to handle its interrupts instead of rolling its own copy of the irqdomain handling etc. Signed-off-by: Linus Walleij Signed-off-by: Hauke Mehrtens Signed-off-by: Kalle Valo --- drivers/bcma/Kconfig | 2 +- drivers/bcma/driver_gpio.c | 92 ++++++++++------------------- include/linux/bcma/bcma_driver_chipcommon.h | 1 - 3 files changed, 31 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig index be5fffb6da24..023d448ed3fa 100644 --- a/drivers/bcma/Kconfig +++ b/drivers/bcma/Kconfig @@ -92,7 +92,7 @@ config BCMA_DRIVER_GMAC_CMN config BCMA_DRIVER_GPIO bool "BCMA GPIO driver" depends on BCMA && GPIOLIB - select IRQ_DOMAIN if BCMA_HOST_SOC + select GPIOLIB_IRQCHIP if BCMA_HOST_SOC help Driver to provide access to the GPIO pins of the bcma bus. diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c index 5f6018e7cd4c..504899a72966 100644 --- a/drivers/bcma/driver_gpio.c +++ b/drivers/bcma/driver_gpio.c @@ -8,10 +8,8 @@ * Licensed under the GNU/GPL. See COPYING for details. */ -#include -#include +#include #include -#include #include #include @@ -79,19 +77,11 @@ static void bcma_gpio_free(struct gpio_chip *chip, unsigned gpio) } #if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X) -static int bcma_gpio_to_irq(struct gpio_chip *chip, unsigned gpio) -{ - struct bcma_drv_cc *cc = bcma_gpio_get_cc(chip); - - if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC) - return irq_find_mapping(cc->irq_domain, gpio); - else - return -EINVAL; -} static void bcma_gpio_irq_unmask(struct irq_data *d) { - struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc); int gpio = irqd_to_hwirq(d); u32 val = bcma_chipco_gpio_in(cc, BIT(gpio)); @@ -101,7 +91,8 @@ static void bcma_gpio_irq_unmask(struct irq_data *d) static void bcma_gpio_irq_mask(struct irq_data *d) { - struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d); + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc); int gpio = irqd_to_hwirq(d); bcma_chipco_gpio_intmask(cc, BIT(gpio), 0); @@ -116,6 +107,7 @@ static struct irq_chip bcma_gpio_irq_chip = { static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) { struct bcma_drv_cc *cc = dev_id; + struct gpio_chip *gc = &cc->gpio; u32 val = bcma_cc_read32(cc, BCMA_CC_GPIOIN); u32 mask = bcma_cc_read32(cc, BCMA_CC_GPIOIRQ); u32 pol = bcma_cc_read32(cc, BCMA_CC_GPIOPOL); @@ -125,81 +117,58 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id) if (!irqs) return IRQ_NONE; - for_each_set_bit(gpio, &irqs, cc->gpio.ngpio) - generic_handle_irq(bcma_gpio_to_irq(&cc->gpio, gpio)); + for_each_set_bit(gpio, &irqs, gc->ngpio) + generic_handle_irq(irq_find_mapping(gc->irqdomain, gpio)); bcma_chipco_gpio_polarity(cc, irqs, val & irqs); return IRQ_HANDLED; } -static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc) +static int bcma_gpio_irq_init(struct bcma_drv_cc *cc) { struct gpio_chip *chip = &cc->gpio; - int gpio, hwirq, err; + int hwirq, err; if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC) return 0; - cc->irq_domain = irq_domain_add_linear(NULL, chip->ngpio, - &irq_domain_simple_ops, cc); - if (!cc->irq_domain) { - err = -ENODEV; - goto err_irq_domain; - } - for (gpio = 0; gpio < chip->ngpio; gpio++) { - int irq = irq_create_mapping(cc->irq_domain, gpio); - - irq_set_chip_data(irq, cc); - irq_set_chip_and_handler(irq, &bcma_gpio_irq_chip, - handle_simple_irq); - } - hwirq = bcma_core_irq(cc->core, 0); err = request_irq(hwirq, bcma_gpio_irq_handler, IRQF_SHARED, "gpio", cc); if (err) - goto err_req_irq; + return err; bcma_chipco_gpio_intmask(cc, ~0, 0); bcma_cc_set32(cc, BCMA_CC_IRQMASK, BCMA_CC_IRQ_GPIO); - return 0; - -err_req_irq: - for (gpio = 0; gpio < chip->ngpio; gpio++) { - int irq = irq_find_mapping(cc->irq_domain, gpio); - - irq_dispose_mapping(irq); + err = gpiochip_irqchip_add(chip, + &bcma_gpio_irq_chip, + 0, + handle_simple_irq, + IRQ_TYPE_NONE); + if (err) { + free_irq(hwirq, cc); + return err; } - irq_domain_remove(cc->irq_domain); -err_irq_domain: - return err; + + return 0; } -static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc) +static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc) { - struct gpio_chip *chip = &cc->gpio; - int gpio; - if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC) return; bcma_cc_mask32(cc, BCMA_CC_IRQMASK, ~BCMA_CC_IRQ_GPIO); free_irq(bcma_core_irq(cc->core, 0), cc); - for (gpio = 0; gpio < chip->ngpio; gpio++) { - int irq = irq_find_mapping(cc->irq_domain, gpio); - - irq_dispose_mapping(irq); - } - irq_domain_remove(cc->irq_domain); } #else -static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc) +static int bcma_gpio_irq_init(struct bcma_drv_cc *cc) { return 0; } -static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc) +static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc) { } #endif @@ -218,9 +187,8 @@ int bcma_gpio_init(struct bcma_drv_cc *cc) chip->set = bcma_gpio_set_value; chip->direction_input = bcma_gpio_direction_input; chip->direction_output = bcma_gpio_direction_output; -#if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X) - chip->to_irq = bcma_gpio_to_irq; -#endif + chip->owner = THIS_MODULE; + chip->dev = bcma_bus_get_host_dev(bus); #if IS_BUILTIN(CONFIG_OF) if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC) chip->of_node = cc->core->dev.of_node; @@ -248,13 +216,13 @@ int bcma_gpio_init(struct bcma_drv_cc *cc) else chip->base = -1; - err = bcma_gpio_irq_domain_init(cc); + err = gpiochip_add(chip); if (err) return err; - err = gpiochip_add(chip); + err = bcma_gpio_irq_init(cc); if (err) { - bcma_gpio_irq_domain_exit(cc); + gpiochip_remove(chip); return err; } @@ -263,7 +231,7 @@ int bcma_gpio_init(struct bcma_drv_cc *cc) int bcma_gpio_unregister(struct bcma_drv_cc *cc) { - bcma_gpio_irq_domain_exit(cc); + bcma_gpio_irq_exit(cc); gpiochip_remove(&cc->gpio); return 0; } diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 6cceedf65ca2..cf038431a5cc 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -640,7 +640,6 @@ struct bcma_drv_cc { spinlock_t gpio_lock; #ifdef CONFIG_BCMA_DRIVER_GPIO struct gpio_chip gpio; - struct irq_domain *irq_domain; #endif }; -- cgit v1.2.3 From f4e774f55fe0bb568a0877b2eb9e1b4b5a6f5cbc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 19 Aug 2015 09:46:22 +0200 Subject: average: remove out-of-line implementation Since all users are now converted to the inline implementation, remove the out-of-line implementation entirely. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/average.h | 24 ------------------- lib/Kconfig | 10 -------- lib/average.c | 64 ------------------------------------------------- 3 files changed, 98 deletions(-) delete mode 100644 lib/average.c (limited to 'include/linux') diff --git a/include/linux/average.h b/include/linux/average.h index 60f8226c5652..d04aa58280de 100644 --- a/include/linux/average.h +++ b/include/linux/average.h @@ -3,30 +3,6 @@ /* Exponentially weighted moving average (EWMA) */ -/* For more documentation see lib/average.c */ - -struct ewma { - unsigned long internal; - unsigned long factor; - unsigned long weight; -}; - -extern void ewma_init(struct ewma *avg, unsigned long factor, - unsigned long weight); - -extern struct ewma *ewma_add(struct ewma *avg, unsigned long val); - -/** - * ewma_read() - Get average value - * @avg: Average structure - * - * Returns the average value held in @avg. - */ -static inline unsigned long ewma_read(const struct ewma *avg) -{ - return avg->internal >> avg->factor; -} - #define DECLARE_EWMA(name, _factor, _weight) \ struct ewma_##name { \ unsigned long internal; \ diff --git a/lib/Kconfig b/lib/Kconfig index 3a2ef67db6c7..278890dd1049 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -460,16 +460,6 @@ config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE config LRU_CACHE tristate -config AVERAGE - bool "Averaging functions" - help - This option is provided for the case where no in-kernel-tree - modules require averaging functions, but a module built outside - the kernel tree does. Such modules that use library averaging - functions require Y here. - - If unsure, say N. - config CLZ_TAB bool diff --git a/lib/average.c b/lib/average.c deleted file mode 100644 index 114d1beae0c7..000000000000 --- a/lib/average.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * lib/average.c - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - */ - -#include -#include -#include -#include -#include - -/** - * DOC: Exponentially Weighted Moving Average (EWMA) - * - * These are generic functions for calculating Exponentially Weighted Moving - * Averages (EWMA). We keep a structure with the EWMA parameters and a scaled - * up internal representation of the average value to prevent rounding errors. - * The factor for scaling up and the exponential weight (or decay rate) have to - * be specified thru the init fuction. The structure should not be accessed - * directly but only thru the helper functions. - */ - -/** - * ewma_init() - Initialize EWMA parameters - * @avg: Average structure - * @factor: Factor to use for the scaled up internal value. The maximum value - * of averages can be ULONG_MAX/(factor*weight). For performance reasons - * factor has to be a power of 2. - * @weight: Exponential weight, or decay rate. This defines how fast the - * influence of older values decreases. For performance reasons weight has - * to be a power of 2. - * - * Initialize the EWMA parameters for a given struct ewma @avg. - */ -void ewma_init(struct ewma *avg, unsigned long factor, unsigned long weight) -{ - WARN_ON(!is_power_of_2(weight) || !is_power_of_2(factor)); - - avg->weight = ilog2(weight); - avg->factor = ilog2(factor); - avg->internal = 0; -} -EXPORT_SYMBOL(ewma_init); - -/** - * ewma_add() - Exponentially weighted moving average (EWMA) - * @avg: Average structure - * @val: Current value - * - * Add a sample to the average. - */ -struct ewma *ewma_add(struct ewma *avg, unsigned long val) -{ - unsigned long internal = ACCESS_ONCE(avg->internal); - - ACCESS_ONCE(avg->internal) = internal ? - (((internal << avg->weight) - internal) + - (val << avg->factor)) >> avg->weight : - (val << avg->factor); - return avg; -} -EXPORT_SYMBOL(ewma_add); -- cgit v1.2.3 From b7fe10e5ebac2a3f37e95535e616494b65fa020f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 19 Aug 2015 17:07:32 -0700 Subject: gro: Fix remcsum offload to deal with frags in GRO The remote checksum offload GRO did not consider the case that frag0 might be in use. This patch fixes that by accessing headers using the skb_gro functions and not saving offsets relative to skb->head. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 23 +++++++++-------------- include/linux/netdevice.h | 44 ++++++++++++++++++++++++++++++++------------ net/ipv4/fou.c | 28 ++++++++++++---------------- 3 files changed, 53 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 54615bb9d916..64fcd2402562 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -519,10 +519,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, u32 data, struct gro_remcsum *grc, bool nopartial) { - size_t start, offset, plen; + size_t start, offset; if (skb->remcsum_offload) - return NULL; + return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -532,17 +532,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, offsetof(struct udphdr, check) : offsetof(struct tcphdr, check)); - plen = hdrlen + offset + sizeof(u16); - - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - vh = skb_gro_header_slow(skb, off + plen, off); - if (!vh) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)vh + hdrlen, - start, offset, grc, nopartial); + vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -573,7 +564,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = ntohl(vh->vx_flags); @@ -588,6 +578,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + flush = 0; for (p = *head; p; p = p->next) { @@ -1110,6 +1102,9 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, { size_t start, offset, plen; + if (skb->remcsum_offload) + return vh; + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; offset = start + ((data & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4bd177faa90c..6abe0d6f1e1d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2311,8 +2311,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { - return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) == - skb_gro_offset(skb)); + return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, @@ -2408,37 +2407,58 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) grc->delta = 0; } -static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) +static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + unsigned int off, size_t hdrlen, + int start, int offset, + struct gro_remcsum *grc, + bool nopartial) { __wsum delta; + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = - ((unsigned char *)ptr + start) - skb->head; - return; + NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; + return ptr; + } + + ptr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, off + plen)) { + ptr = skb_gro_header_slow(skb, off + plen, off); + if (!ptr) + return NULL; } - delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); + delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, + start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - grc->offset = (ptr + offset) - (void *)skb->head; + grc->offset = off + hdrlen + offset; grc->delta = delta; + + return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { + void *ptr; + size_t plen = grc->offset + sizeof(u16); + if (!grc->delta) return; - remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta); + ptr = skb_gro_header_fast(skb, grc->offset); + if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { + ptr = skb_gro_header_slow(skb, plen, grc->offset); + if (!ptr) + return; + } + + remcsum_unadjust((__sum16 *)ptr, grc->delta); } static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 34968cd5c146..eb11f9506894 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -79,7 +79,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + size_t plen = sizeof(struct udphdr) + hdrlen + + max_t(size_t, offset + sizeof(u16), start); + + if (skb->remcsum_offload) + return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; @@ -221,29 +225,21 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto, - struct gro_remcsum *grc, bool nopartial) + size_t hdrlen, struct gro_remcsum *grc, + bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return NULL; + return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - guehdr = skb_gro_header_slow(skb, off + plen, off); - if (!guehdr) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc, nopartial); + guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -307,10 +303,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, - data + doffset, hdrlen, - guehdr->proto_ctype, &grc, + data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); + if (!guehdr) goto out; -- cgit v1.2.3 From 0e4ead9d7b3655d76371604abb9b0dcc4e79bb7d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:18 +0200 Subject: net: introduce change upper device notifier change info Add info that is passed along with NETDEV_CHANGEUPPER event. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/core/dev.c | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6abe0d6f1e1d..39f30daac483 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2127,6 +2127,13 @@ struct netdev_notifier_change_info { unsigned int flags_changed; }; +struct netdev_notifier_changeupper_info { + struct netdev_notifier_info info; /* must be first */ + struct net_device *upper_dev; /* new upper dev */ + bool master; /* is upper dev master */ + bool linking; /* is the nofication for link or unlink */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/net/core/dev.c b/net/core/dev.c index 7bb24f1879b8..a8e6cf4298d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5311,6 +5311,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *private) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -5329,6 +5330,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5367,7 +5372,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; rollback_lower_mesh: @@ -5462,9 +5468,14 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j; ASSERT_RTNL(); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower @@ -5484,7 +5495,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); -- cgit v1.2.3 From 0894ae3f0a587bda9733ec4a4b67af7ded3a9498 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:19 +0200 Subject: net: add netif_is_bridge_master helper Add this helper so code can easily figure out if netdev is a bridge. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39f30daac483..be625f4754b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3848,6 +3848,11 @@ static inline bool netif_is_vrf(const struct net_device *dev) return dev->priv_flags & IFF_VRF_MASTER; } +static inline bool netif_is_bridge_master(const struct net_device *dev) +{ + return dev->priv_flags & IFF_EBRIDGE; +} + static inline bool netif_index_is_vrf(struct net *net, int ifindex) { bool rc = false; -- cgit v1.2.3 From 35d4e1725202e6656fcfa8b88447327ad3ae0c0c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:20 +0200 Subject: net: add netif_is_ovs_master helper with IFF_OPENVSWITCH private flag Add this helper so code can easily figure out if netdev is openswitch. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ net/openvswitch/vport-internal_dev.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be625f4754b9..6656a43c072d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1264,6 +1264,7 @@ struct net_device_ops { * @IFF_MACVLAN: Macvlan device * @IFF_VRF_MASTER: device is a VRF master * @IFF_NO_QUEUE: device can run without qdisc attached + * @IFF_OPENVSWITCH: device is a Open vSwitch master */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1293,6 +1294,7 @@ enum netdev_priv_flags { IFF_IPVLAN_SLAVE = 1<<24, IFF_VRF_MASTER = 1<<25, IFF_NO_QUEUE = 1<<26, + IFF_OPENVSWITCH = 1<<27, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1322,6 +1324,7 @@ enum netdev_priv_flags { #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE #define IFF_VRF_MASTER IFF_VRF_MASTER #define IFF_NO_QUEUE IFF_NO_QUEUE +#define IFF_OPENVSWITCH IFF_OPENVSWITCH /** * struct net_device - The DEVICE structure. @@ -3853,6 +3856,11 @@ static inline bool netif_is_bridge_master(const struct net_device *dev) return dev->priv_flags & IFF_EBRIDGE; } +static inline bool netif_is_ovs_master(const struct net_device *dev) +{ + return dev->priv_flags & IFF_OPENVSWITCH; +} + static inline bool netif_index_is_vrf(struct net *net, int ifindex) { bool rc = false; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index c058bbf876c3..80b3e12ec882 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -135,7 +135,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; -- cgit v1.2.3 From 0dc1549bfd67053181415a3f7544628a6bcd2a08 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:21 +0200 Subject: net: kill long time unused bonding private flags We don't use them for years, just kill them now. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 57 +++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6656a43c072d..88a00694eda5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1240,13 +1240,8 @@ struct net_device_ops { * * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device - * @IFF_SLAVE_INACTIVE: bonding slave not the curr. active - * @IFF_MASTER_8023AD: bonding master, 802.3ad - * @IFF_MASTER_ALB: bonding master, balance-alb * @IFF_BONDING: bonding master or slave - * @IFF_SLAVE_NEEDARP: need ARPs for validation * @IFF_ISATAP: ISATAP interface (RFC4214) - * @IFF_MASTER_ARPMON: bonding master, ARP mon in use * @IFF_WAN_HDLC: WAN HDLC device * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to * release skb->dst @@ -1269,43 +1264,33 @@ struct net_device_ops { enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, IFF_EBRIDGE = 1<<1, - IFF_SLAVE_INACTIVE = 1<<2, - IFF_MASTER_8023AD = 1<<3, - IFF_MASTER_ALB = 1<<4, - IFF_BONDING = 1<<5, - IFF_SLAVE_NEEDARP = 1<<6, - IFF_ISATAP = 1<<7, - IFF_MASTER_ARPMON = 1<<8, - IFF_WAN_HDLC = 1<<9, - IFF_XMIT_DST_RELEASE = 1<<10, - IFF_DONT_BRIDGE = 1<<11, - IFF_DISABLE_NETPOLL = 1<<12, - IFF_MACVLAN_PORT = 1<<13, - IFF_BRIDGE_PORT = 1<<14, - IFF_OVS_DATAPATH = 1<<15, - IFF_TX_SKB_SHARING = 1<<16, - IFF_UNICAST_FLT = 1<<17, - IFF_TEAM_PORT = 1<<18, - IFF_SUPP_NOFCS = 1<<19, - IFF_LIVE_ADDR_CHANGE = 1<<20, - IFF_MACVLAN = 1<<21, - IFF_XMIT_DST_RELEASE_PERM = 1<<22, - IFF_IPVLAN_MASTER = 1<<23, - IFF_IPVLAN_SLAVE = 1<<24, - IFF_VRF_MASTER = 1<<25, - IFF_NO_QUEUE = 1<<26, - IFF_OPENVSWITCH = 1<<27, + IFF_BONDING = 1<<2, + IFF_ISATAP = 1<<3, + IFF_WAN_HDLC = 1<<4, + IFF_XMIT_DST_RELEASE = 1<<5, + IFF_DONT_BRIDGE = 1<<6, + IFF_DISABLE_NETPOLL = 1<<7, + IFF_MACVLAN_PORT = 1<<8, + IFF_BRIDGE_PORT = 1<<9, + IFF_OVS_DATAPATH = 1<<10, + IFF_TX_SKB_SHARING = 1<<11, + IFF_UNICAST_FLT = 1<<12, + IFF_TEAM_PORT = 1<<13, + IFF_SUPP_NOFCS = 1<<14, + IFF_LIVE_ADDR_CHANGE = 1<<15, + IFF_MACVLAN = 1<<16, + IFF_XMIT_DST_RELEASE_PERM = 1<<17, + IFF_IPVLAN_MASTER = 1<<18, + IFF_IPVLAN_SLAVE = 1<<19, + IFF_VRF_MASTER = 1<<20, + IFF_NO_QUEUE = 1<<21, + IFF_OPENVSWITCH = 1<<22, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN #define IFF_EBRIDGE IFF_EBRIDGE -#define IFF_SLAVE_INACTIVE IFF_SLAVE_INACTIVE -#define IFF_MASTER_8023AD IFF_MASTER_8023AD -#define IFF_MASTER_ALB IFF_MASTER_ALB #define IFF_BONDING IFF_BONDING -#define IFF_SLAVE_NEEDARP IFF_SLAVE_NEEDARP #define IFF_ISATAP IFF_ISATAP -#define IFF_MASTER_ARPMON IFF_MASTER_ARPMON #define IFF_WAN_HDLC IFF_WAN_HDLC #define IFF_XMIT_DST_RELEASE IFF_XMIT_DST_RELEASE #define IFF_DONT_BRIDGE IFF_DONT_BRIDGE -- cgit v1.2.3 From 2e4cfae2a8e3f9ce3925c9b6e9e865fe8476fc4f Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 27 Aug 2015 15:25:45 -0700 Subject: netfilter: Define v6ops in !CONFIG_NETFILTER case. When CONFIG_OPENVSWITCH is set, and CONFIG_NETFILTER is not set, the openvswitch IPv6 fragmentation handling cannot refer to ipv6_ops because it isn't defined. Add a dummy version to avoid #ifdefs in source files. Fixes: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer Signed-off-by: David S. Miller --- include/linux/netfilter_ipv6.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 8b7d28f3aada..771574677e83 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -9,15 +9,6 @@ #include - -#ifdef CONFIG_NETFILTER -int ip6_route_me_harder(struct sk_buff *skb); -__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, - unsigned int dataoff, u_int8_t protocol); - -int ipv6_netfilter_init(void); -void ipv6_netfilter_fini(void); - /* * Hook functions for ipv6 to allow xt_* modules to be built-in even * if IPv6 is a module. @@ -30,6 +21,14 @@ struct nf_ipv6_ops { int (*output)(struct sock *, struct sk_buff *)); }; +#ifdef CONFIG_NETFILTER +int ip6_route_me_harder(struct sk_buff *skb); +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol); + +int ipv6_netfilter_init(void); +void ipv6_netfilter_fini(void); + extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { @@ -39,6 +38,7 @@ static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) #else /* CONFIG_NETFILTER */ static inline int ipv6_netfilter_init(void) { return 0; } static inline void ipv6_netfilter_fini(void) { return; } +static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return NULL; } #endif /* CONFIG_NETFILTER */ #endif /*__LINUX_IP6_NETFILTER_H*/ -- cgit v1.2.3 From df2cf4a78e488d26728590cb3c6b4fe4c4862c77 Mon Sep 17 00:00:00 2001 From: Philip Downey Date: Thu, 27 Aug 2015 16:46:26 +0100 Subject: IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 + net/ipv4/igmp.c | 26 +++++++++++++++++++++++++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 193ad488d3e2..908429216d9f 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -37,6 +37,7 @@ static inline struct igmpv3_query * return (struct igmpv3_query *)skb_transport_header(skb); } +extern int sysctl_igmp_llm_reports; extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; extern int sysctl_igmp_qrv; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9fdfd9deac11..d38b8b61eaee 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -110,6 +110,9 @@ #define IP_MAX_MEMBERSHIPS 20 #define IP_MAX_MSF 10 +/* IGMP reports for link-local multicast groups are enabled by default */ +int sysctl_igmp_llm_reports __read_mostly = 1; + #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; + if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; @@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(pmc->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; @@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - else if (type == IGMP_HOST_LEAVE_MESSAGE) + + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return 0; + + if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; @@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; @@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; reporter = im->reporter; igmp_stop_timer(im); @@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; if (in_dev->dead) return; @@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; /* a failover is happening and switches * must be notified immediately diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 879bdc5c95b1..894da3a70aff 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -929,6 +929,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "igmp_link_local_mcast_reports", + .data = &sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; -- cgit v1.2.3 From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Aug 2015 15:56:22 -0700 Subject: lib: introduce strncpy_from_unsafe() generalize FETCH_FUNC_NAME(memory, string) into strncpy_from_unsafe() and fix sparse warnings that were present in original implementation. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/uaccess.h | 2 ++ kernel/trace/trace_kprobe.c | 20 ++++---------------- lib/strncpy_from_user.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ae572c138607..d6f2c2c5b043 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -129,4 +129,6 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size); extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); +extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); + #endif /* __LINUX_UACCESS_H__ */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b7d0cdd9906c..c9956440d0e6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory) static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr, void *dest) { - long ret; int maxlen = get_rloc_len(*(u32 *)dest); u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); + long ret; if (!maxlen) return; @@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, * Try to get string again, since the string can be changed while * probing. */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); + ret = strncpy_from_unsafe(dst, addr, maxlen); if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; + dst[0] = '\0'; *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest)); } } NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e0af6ff73d14..ead8c4a068d1 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -112,3 +112,44 @@ long strncpy_from_user(char *dst, const char __user *src, long count) return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user); + +/** + * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Unsafe address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + const void *src = unsafe_addr; + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(dst++, + (const void __user __force *)src++, 1); + } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + return ret < 0 ? ret : src - unsafe_addr; +} -- cgit v1.2.3 From 5a11dd7d9649149f336ca72069d56ce52b21567f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 31 Aug 2015 15:56:46 +0200 Subject: net: phy: Allow PHY devices to identify themselves as Ethernet switches, etc. Some Ethernet MAC drivers using the PHY library require the hardcoding of link parameters when interfaced to a switch device, SFP module, switch to switch port, etc. This has typically lead to various ad-hoc implementations looking like this: - using a "fixed PHY" emulated device, which will provide link indication towards the Ethernet MAC driver and hardware - pretend there is no PHY and hardcode link parameters, ala mv643x_eth Based on that, it is desireable to have the PHY drivers advertise the correct link parameters, just like regular Ethernet PHYs towards their CPU Ethernet MAC drivers, however, Ethernet MAC drivers should be able to tell whether this link should be monitored or not. In the context of an Ethernet switch, SFP module, switch to switch link, we do not need to monitor this link since it should be always up. Signed-off-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/fixed_phy.c | 1 + include/linux/phy.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 99d9bc19c94a..ce46428ff21f 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -303,6 +303,7 @@ struct phy_device *fixed_phy_register(unsigned int irq, of_node_get(np); phy->dev.of_node = np; + phy->is_pseudo_fixed_link = true; ret = phy_device_register(phy); if (ret) { diff --git a/include/linux/phy.h b/include/linux/phy.h index e5fb1d415961..962387a192f1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -330,6 +330,7 @@ struct phy_c45_device_ids { * c45_ids: 802.3-c45 Device Identifers if is_c45. * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. + * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc. * has_fixups: Set to true if this phy has fixups/quirks. * suspended: Set to true if this phy has been suspended successfully. * state: state of the PHY for management purposes @@ -368,6 +369,7 @@ struct phy_device { struct phy_c45_device_ids c45_ids; bool is_c45; bool is_internal; + bool is_pseudo_fixed_link; bool has_fixups; bool suspended; @@ -688,6 +690,16 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) { return phydev->interface >= PHY_INTERFACE_MODE_RGMII && phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID; +}; + +/* + * phy_is_pseudo_fixed_link - Convenience function for testing if this + * PHY is the CPU port facing side of an Ethernet switch, or similar. + * @phydev: the phy_device struct + */ +static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) +{ + return phydev->is_pseudo_fixed_link; } /** -- cgit v1.2.3 From a5597008dbc230876db2d344561d634f4d52ea4a Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 31 Aug 2015 15:56:53 +0200 Subject: phy: fixed_phy: Add gpio to determine link up/down. An SFP module may have a link up/down status pin which can be connection to a GPIO line of the host. Add support for reading such an GPIO in the fixed_phy driver. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../devicetree/bindings/net/fixed-link.txt | 14 +++++++++++- Documentation/networking/stmmac.txt | 2 +- arch/m68k/coldfire/m5272.c | 2 +- arch/mips/ar7/platform.c | 5 +++-- arch/mips/bcm47xx/setup.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmmii.c | 2 +- drivers/net/phy/fixed_phy.c | 26 +++++++++++++++++++--- drivers/of/of_mdio.c | 13 ++++++++--- include/linux/phy_fixed.h | 8 +++++-- 9 files changed, 59 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt index 82bf7e0f47b6..ec5d889fe3d8 100644 --- a/Documentation/devicetree/bindings/net/fixed-link.txt +++ b/Documentation/devicetree/bindings/net/fixed-link.txt @@ -17,6 +17,8 @@ properties: enabled. * 'asym-pause' (boolean, optional), to indicate that asym_pause should be enabled. +* 'link-gpios' ('gpio-list', optional), to indicate if a gpio can be read + to determine if the link is up. Old, deprecated 'fixed-link' binding: @@ -30,7 +32,7 @@ Old, deprecated 'fixed-link' binding: - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for asymmetric pause -Example: +Examples: ethernet@0 { ... @@ -40,3 +42,13 @@ ethernet@0 { }; ... }; + +ethernet@1 { + ... + fixed-link { + speed = <1000>; + pause; + link-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>; + }; + ... +}; diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt index 2903b1cf4d70..d64a14714236 100644 --- a/Documentation/networking/stmmac.txt +++ b/Documentation/networking/stmmac.txt @@ -254,7 +254,7 @@ static struct fixed_phy_status stmmac0_fixed_phy_status = { During the board's device_init we can configure the first MAC for fixed_link by calling: - fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status));) + fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status, -1); and the second one, with a real PHY device attached to the bus, by using the stmmac_mdio_bus_data structure (to provide the id, the reset procedure etc). diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index b15219ed22bf..c525e4c08f84 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -126,7 +126,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status); + fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status, -1); return 0; } diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c index be9ff1673ded..298b97715d5f 100644 --- a/arch/mips/ar7/platform.c +++ b/arch/mips/ar7/platform.c @@ -679,7 +679,8 @@ static int __init ar7_register_devices(void) } if (ar7_has_high_cpmac()) { - res = fixed_phy_add(PHY_POLL, cpmac_high.id, &fixed_phy_status); + res = fixed_phy_add(PHY_POLL, cpmac_high.id, + &fixed_phy_status, -1); if (!res) { cpmac_get_mac(1, cpmac_high_data.dev_addr); @@ -692,7 +693,7 @@ static int __init ar7_register_devices(void) } else cpmac_low_data.phy_mask = 0xffffffff; - res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status); + res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status, -1); if (!res) { cpmac_get_mac(0, cpmac_low_data.dev_addr); res = platform_device_register(&cpmac_low); diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 98c075f81795..17503a05938e 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -263,7 +263,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status); + fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status, -1); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index b3679ad1c1c7..c8affad76f36 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -585,7 +585,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv) .asym_pause = 0, }; - phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); + phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL); if (!phydev || IS_ERR(phydev)) { dev_err(kdev, "failed to register fixed PHY device\n"); return -ENODEV; diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 2f9457f05a2e..1bb70e3cc03e 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -22,6 +22,7 @@ #include #include #include +#include #define MII_REGS_NUM 29 @@ -38,6 +39,7 @@ struct fixed_phy { struct fixed_phy_status status; int (*link_update)(struct net_device *, struct fixed_phy_status *); struct list_head node; + int link_gpio; }; static struct platform_device *pdev; @@ -52,6 +54,9 @@ static int fixed_phy_update_regs(struct fixed_phy *fp) u16 lpagb = 0; u16 lpa = 0; + if (gpio_is_valid(fp->link_gpio)) + fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio); + if (!fp->status.link) goto done; bmsr |= BMSR_LSTATUS | BMSR_ANEGCOMPLETE; @@ -215,7 +220,8 @@ int fixed_phy_update_state(struct phy_device *phydev, EXPORT_SYMBOL(fixed_phy_update_state); int fixed_phy_add(unsigned int irq, int phy_addr, - struct fixed_phy_status *status) + struct fixed_phy_status *status, + int link_gpio) { int ret; struct fixed_mdio_bus *fmb = &platform_fmb; @@ -231,15 +237,26 @@ int fixed_phy_add(unsigned int irq, int phy_addr, fp->addr = phy_addr; fp->status = *status; + fp->link_gpio = link_gpio; + + if (gpio_is_valid(fp->link_gpio)) { + ret = gpio_request_one(fp->link_gpio, GPIOF_DIR_IN, + "fixed-link-gpio-link"); + if (ret) + goto err_regs; + } ret = fixed_phy_update_regs(fp); if (ret) - goto err_regs; + goto err_gpio; list_add_tail(&fp->node, &fmb->phys); return 0; +err_gpio: + if (gpio_is_valid(fp->link_gpio)) + gpio_free(fp->link_gpio); err_regs: kfree(fp); return ret; @@ -254,6 +271,8 @@ void fixed_phy_del(int phy_addr) list_for_each_entry_safe(fp, tmp, &fmb->phys, node) { if (fp->addr == phy_addr) { list_del(&fp->node); + if (gpio_is_valid(fp->link_gpio)) + gpio_free(fp->link_gpio); kfree(fp); return; } @@ -266,6 +285,7 @@ static DEFINE_SPINLOCK(phy_fixed_addr_lock); struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, + int link_gpio, struct device_node *np) { struct fixed_mdio_bus *fmb = &platform_fmb; @@ -282,7 +302,7 @@ struct phy_device *fixed_phy_register(unsigned int irq, phy_addr = phy_fixed_addr++; spin_unlock(&phy_fixed_addr_lock); - ret = fixed_phy_add(PHY_POLL, phy_addr, status); + ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio); if (ret < 0) return ERR_PTR(ret); diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 7c8c23cc6896..1350fa25cdb0 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -294,6 +295,7 @@ int of_phy_register_fixed_link(struct device_node *np) struct fixed_phy_status status = {}; struct device_node *fixed_link_node; const __be32 *fixed_link_prop; + int link_gpio; int len, err; struct phy_device *phy; const char *managed; @@ -302,7 +304,7 @@ int of_phy_register_fixed_link(struct device_node *np) if (err == 0) { if (strcmp(managed, "in-band-status") == 0) { /* status is zeroed, namely its .link member */ - phy = fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, -1, np); return IS_ERR(phy) ? PTR_ERR(phy) : 0; } } @@ -318,8 +320,13 @@ int of_phy_register_fixed_link(struct device_node *np) status.pause = of_property_read_bool(fixed_link_node, "pause"); status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); + link_gpio = of_get_named_gpio_flags(fixed_link_node, + "link-gpios", 0, NULL); of_node_put(fixed_link_node); - phy = fixed_phy_register(PHY_POLL, &status, np); + if (link_gpio == -EPROBE_DEFER) + return -EPROBE_DEFER; + + phy = fixed_phy_register(PHY_POLL, &status, link_gpio, np); return IS_ERR(phy) ? PTR_ERR(phy) : 0; } @@ -331,7 +338,7 @@ int of_phy_register_fixed_link(struct device_node *np) status.speed = be32_to_cpu(fixed_link_prop[2]); status.pause = be32_to_cpu(fixed_link_prop[3]); status.asym_pause = be32_to_cpu(fixed_link_prop[4]); - phy = fixed_phy_register(PHY_POLL, &status, np); + phy = fixed_phy_register(PHY_POLL, &status, -1, np); return IS_ERR(phy) ? PTR_ERR(phy) : 0; } diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index fe5732d53eda..2400d2ea4f34 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -13,9 +13,11 @@ struct device_node; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status); + struct fixed_phy_status *status, + int link_gpio); extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, + int link_gpio, struct device_node *np); extern void fixed_phy_del(int phy_addr); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -26,12 +28,14 @@ extern int fixed_phy_update_state(struct phy_device *phydev, const struct fixed_phy_status *changed); #else static inline int fixed_phy_add(unsigned int irq, int phy_id, - struct fixed_phy_status *status) + struct fixed_phy_status *status, + int link_gpio) { return -ENODEV; } static inline struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, + int gpio_link, struct device_node *np) { return ERR_PTR(-ENODEV); -- cgit v1.2.3 From e5276937ae6e654a811345f0716266f12e77bede Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:23 -0700 Subject: flow_dissector: Move skb related functions to skbuff.h Move the flow dissector functions that are specific to skbuffs into skbuff.h out of flow_dissector.h. This makes flow_dissector.h have no dependencies on skbuff.h. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 47 +++++++++++++++++++++++++++++++++++++++++ include/net/flow_dissector.h | 50 -------------------------------------------- 2 files changed, 47 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 989307f991db..8a697c673b58 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -945,6 +945,53 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) skb->hash = hash; } +void __skb_get_hash(struct sk_buff *skb); +u32 skb_get_poff(const struct sk_buff *skb); +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen); +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); + +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, + int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count); + +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, + void *data, __be16 proto, int nhoff, int hlen); + +static inline bool skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + return __skb_flow_dissect(skb, flow_dissector, target_container, + NULL, 0, 0, 0); +} + +static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow) +{ + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(skb, &flow_keys_dissector, flow, + NULL, 0, 0, 0); +} + +static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, + void *data, __be16 proto, + int nhoff, int hlen) +{ + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, + data, proto, nhoff, hlen); +} + static inline __u32 skb_get_hash(struct sk_buff *skb) { if (!skb->l4_hash && !skb->sw_hash) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1a8c22419936..6777a84c6f94 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -2,7 +2,6 @@ #define _NET_FLOW_DISSECTOR_H #include -#include #include #include @@ -134,23 +133,6 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; -void skb_flow_dissector_init(struct flow_dissector *flow_dissector, - const struct flow_dissector_key *key, - unsigned int key_count); - -bool __skb_flow_dissect(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen); - -static inline bool skb_flow_dissect(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) -{ - return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0); -} - struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic @@ -170,38 +152,6 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; -static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, - struct flow_keys *flow) -{ - memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0); -} - -static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, - void *data, __be16 proto, - int nhoff, int hlen) -{ - memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, - data, proto, nhoff, hlen); -} - -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); - -static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, - int thoff, u8 ip_proto) -{ - return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); -} - -u32 flow_hash_from_keys(struct flow_keys *keys); -void __skb_get_hash(struct sk_buff *skb); -u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); - /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a -- cgit v1.2.3 From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:24 -0700 Subject: skbuff: Make __skb_set_sw_hash a general function Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is a common method (between __skb_set_sw_hash and skb_set_hash) to set the hash in an skbuff. Also, move skb_clear_hash to be closer to __skb_set_hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 45 ++++++++++++++++++++++++++++---------------- include/net/flow_dissector.h | 5 +++++ net/core/flow_dissector.c | 18 ++++++------------ 3 files changed, 40 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8a697c673b58..5d2c812e725b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -937,14 +937,40 @@ enum pkt_hash_types { PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; -static inline void -skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) +static inline void skb_clear_hash(struct sk_buff *skb) { - skb->l4_hash = (type == PKT_HASH_TYPE_L4); + skb->hash = 0; skb->sw_hash = 0; + skb->l4_hash = 0; +} + +static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) +{ + if (!skb->l4_hash) + skb_clear_hash(skb); +} + +static inline void +__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) +{ + skb->l4_hash = is_l4; + skb->sw_hash = is_sw; skb->hash = hash; } +static inline void +skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) +{ + /* Used by drivers to set hash from HW */ + __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); +} + +static inline void +__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) +{ + __skb_set_hash(skb, hash, true, is_l4); +} + void __skb_get_hash(struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, @@ -1027,19 +1053,6 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) return skb->hash; } -static inline void skb_clear_hash(struct sk_buff *skb) -{ - skb->hash = 0; - skb->sw_hash = 0; - skb->l4_hash = 0; -} - -static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) -{ - if (!skb->l4_hash) - skb_clear_hash(skb); -} - static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 6777a84c6f94..af76c496f7db 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -167,4 +167,9 @@ struct flow_keys_digest { void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); +static inline bool flow_keys_have_l4(struct flow_keys *keys) +{ + return (keys->ports.ports || keys->tags.flow_label); +} + #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 11e6540fa386..151b6e48b81f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -590,15 +590,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); -static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, - struct flow_keys *keys) -{ - if (keys->ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; -} - /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from @@ -619,7 +610,8 @@ void __skb_get_hash(struct sk_buff *skb) if (!hash) return; - __skb_set_sw_hash(skb, hash, &keys); + __skb_set_sw_hash(skb, hash, + flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); @@ -648,7 +640,8 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) keys.tags.flow_label = (__force u32)fl6->flowlabel; keys.basic.ip_proto = fl6->flowi6_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } @@ -668,7 +661,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) keys.keyid.keyid = fl4->fl4_gre_key; keys.basic.ip_proto = fl4->flowi4_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } -- cgit v1.2.3 From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:25 -0700 Subject: flowi: Abstract out functions to get flow hash based on flowi Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the flow keys and hash based on flowi structures. These are called by __skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created get_hash_from_flowi6 and get_hash_from_flowi4 which can be called when just the hash value for a flowi is needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++++++---- include/net/flow.h | 19 +++++++++++++++++++ include/net/flow_dissector.h | 2 ++ net/core/flow.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5d2c812e725b..bbe41bccfc5f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1030,8 +1030,12 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) { - if (!skb->l4_hash && !skb->sw_hash) - __skb_get_hash_flowi6(skb, fl6); + if (!skb->l4_hash && !skb->sw_hash) { + struct flow_keys keys; + + __skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys), + flow_keys_have_l4(&keys)); + } return skb->hash; } @@ -1040,8 +1044,12 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) { - if (!skb->l4_hash && !skb->sw_hash) - __skb_get_hash_flowi4(skb, fl4); + if (!skb->l4_hash && !skb->sw_hash) { + struct flow_keys keys; + + __skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys), + flow_keys_have_l4(&keys)); + } return skb->hash; } diff --git a/include/net/flow.h b/include/net/flow.h index 9e0297c4c11d..dafe97c3c048 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +{ + struct flow_keys keys; + + return __get_hash_from_flowi6(fl6, &keys); +} + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +{ + struct flow_keys keys; + + return __get_hash_from_flowi4(fl4, &keys); +} + #endif diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index af76c496f7db..74fe160f0b05 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys) return (keys->ports.ports || keys->tags.flow_label); } +u32 flow_hash_from_keys(struct flow_keys *keys); + #endif diff --git a/net/core/flow.c b/net/core/flow.c index 1033725be40b..61930bb0eb59 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -509,3 +510,38 @@ void flow_cache_fini(struct net *net) fc->percpu = NULL; } EXPORT_SYMBOL(flow_cache_fini); + +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); -- cgit v1.2.3 From cd79a2382aa5dcefa6e21a7c59bb1bb19e53b74d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:27 -0700 Subject: flow_dissector: Add flags argument to skb_flow_dissector functions The flags argument will allow control of the dissection process (for instance whether to parse beyond L3). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/skbuff.h | 19 +++++++++++-------- net/core/flow_dissector.c | 7 ++++--- net/ethernet/eth.c | 2 +- net/sched/cls_flow.c | 2 +- net/sched/cls_flower.c | 2 +- net/sched/sch_choke.c | 4 ++-- 9 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 06e2d01f0b4e..771a449d2f56 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3095,7 +3095,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, int noff, proto = -1; if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect_flow_keys(skb, fk); + return skb_flow_dissect_flow_keys(skb, fk, 0); fk->ports.ports = 0; noff = skb_network_offset(skb); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index d106186f4f4a..3c677ed3c29e 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -177,7 +177,7 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, int res, i; enic = netdev_priv(dev); - res = skb_flow_dissect_flow_keys(skb, &keys); + res = skb_flow_dissect_flow_keys(skb, &keys, 0); if (!res || keys.basic.n_proto != htons(ETH_P_IP) || (keys.basic.ip_proto != IPPROTO_TCP && keys.basic.ip_proto != IPPROTO_UDP)) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 2990024b90f9..409b48e1e589 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -239,7 +239,7 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb) struct flow_keys flow; int data_len; - if (!skb_flow_dissect_flow_keys(skb, &flow) || + if (!skb_flow_dissect_flow_keys(skb, &flow, 0) || !(flow.basic.n_proto == htons(ETH_P_IP) || flow.basic.n_proto == htons(ETH_P_IPV6))) return false; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bbe41bccfc5f..9e62687c70f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -991,31 +991,34 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen); + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container) + void *target_container, unsigned int flags) { return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0); + NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, - struct flow_keys *flow) + struct flow_keys *flow, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0); + NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, void *data, __be16 proto, - int nhoff, int hlen) + int nhoff, int hlen, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, - data, proto, nhoff, hlen); + data, proto, nhoff, hlen, flags); } static inline __u32 skb_get_hash(struct sk_buff *skb) @@ -2046,7 +2049,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect_flow_keys(skb, &keys)) + else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 22f3d768b459..c3d9807cb34e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen) + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -556,7 +557,7 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys)) + if (!skb_flow_dissect_flow_keys(skb, keys, 0)) return 0; return __flow_hash_from_keys(keys, keyval); @@ -726,7 +727,7 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect_flow_keys(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 217127c3a3ef..d850fdc828f9 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -132,7 +132,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len)) + sizeof(*eth), len, 0)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index bb2a0f529c1f..536838b657bf 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -301,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect_flow_keys(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2f3d03f99487..57692947ebbe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -129,7 +129,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; - skb_flow_dissect(skb, &head->dissector, &skb_key); + skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 665bde07916b..02bfd3d1c4f0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb2, &temp); + skb_flow_dissect_flow_keys(skb2, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } -- cgit v1.2.3 From de4c1f8ba302ccf4f2b3b17dc614b0a0b14d351a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 18:11:04 -0700 Subject: flow_dissector: Fix function argument ordering dependency Commit c6cc1ca7f4d70c ("flowi: Abstract out functions to get flow hash based on flowi") introduced a bug in __skb_set_sw_hash where we require a dependency on evaluating arguments in a function in order. There is no such ordering enforced in C, so this incorrect. This patch fixes that by splitting out the arguments. This bug was found via a compiler warning that keys may be uninitialized. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9e62687c70f3..eabfb810bc62 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1035,9 +1035,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; + __u32 hash = __get_hash_from_flowi6(fl6, &keys); - __skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys), - flow_keys_have_l4(&keys)); + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; @@ -1049,9 +1049,9 @@ static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; + __u32 hash = __get_hash_from_flowi4(fl4, &keys); - __skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys), - flow_keys_have_l4(&keys)); + __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } return skb->hash; -- cgit v1.2.3 From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Sep 2015 21:19:17 -0700 Subject: flow_dissector: Use 'const' where possible. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++--- include/net/flow.h | 8 ++--- net/core/flow_dissector.c | 79 ++++++++++++++++++++++++----------------------- 3 files changed, 49 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eabfb810bc62..2738d355cdf9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1029,9 +1029,9 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6); -static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; @@ -1043,9 +1043,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) return skb->hash; } -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl); -static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; diff --git a/include/net/flow.h b/include/net/flow.h index dafe97c3c048..acd6a096250e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) { struct flow_keys keys; return __get_hash_from_flowi6(fl6, &keys); } -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4) { struct flow_keys keys; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 345a0408cfe4..d79699c9d1b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,14 +19,14 @@ #include #include -static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } -static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } @@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); - BUG_ON(skb_flow_dissector_uses_key(flow_dissector, - key->key_id)); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); - skb_flow_dissector_set_key(flow_dissector, key->key_id); + dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_CONTROL)); - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); @@ -154,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; @@ -178,8 +178,8 @@ ip: ip_proto = iph->protocol; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; key_addrs = skb_flow_dissector_target(flow_dissector, @@ -218,8 +218,8 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, @@ -232,8 +232,8 @@ ipv6: flow_label = ip6_flowlabel(iph); if (flow_label) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); @@ -257,8 +257,8 @@ ipv6: if (!vlan) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_VLANID, target_container); @@ -298,8 +298,8 @@ ipv6: if (!hdr) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); @@ -320,8 +320,8 @@ mpls: if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); @@ -374,8 +374,8 @@ ip_proto_again: if (!keyid) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); @@ -470,8 +470,8 @@ ip_proto_again: break; } - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); @@ -497,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) { return jhash2(words, length, keyval); } -static inline void *flow_keys_hash_start(struct flow_keys *flow) +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) { + const void *p = flow; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (void *)flow + FLOW_KEYS_HASH_OFFSET; + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); } -static inline size_t flow_keys_hash_length(struct flow_keys *flow) +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); @@ -598,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) __flow_hash_consistentify(keys); - hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + hash = __flow_hash_words(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -677,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { struct flow_keys keys; @@ -701,7 +704,7 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) } EXPORT_SYMBOL(__skb_get_hash_flowi6); -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) { struct flow_keys keys; @@ -787,7 +790,7 @@ u32 skb_get_poff(const struct sk_buff *skb) return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); @@ -806,7 +809,7 @@ __u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) } EXPORT_SYMBOL(__get_hash_from_flowi6); -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); -- cgit v1.2.3 From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Sep 2015 01:26:07 +0200 Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in Fengguang reported, that some randconfig generated the following linker issue with nf_ct_zone_dflt object involved: [...] CC init/version.o LD init/built-in.o net/built-in.o: In function `ipv4_conntrack_defrag': nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt' net/built-in.o: In function `ipv6_defrag': nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt' make: *** [vmlinux] Error 1 Given that configurations exist where we have a built-in part, which is accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user() and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in area when netfilter is configured in general. Therefore, split the more generic parts into a common header under include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in section that already holds parts related to CONFIG_NF_CONNTRACK in the netfilter core. This fixes the issue on my side. Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions") Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 ++ .../linux/netfilter/nf_conntrack_zones_common.h | 23 ++++++++++++++++++++++ include/net/netfilter/nf_conntrack_zones.h | 19 +----------------- net/netfilter/core.c | 6 ++++++ net/netfilter/nf_conntrack_core.c | 7 ------- 5 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 include/linux/netfilter/nf_conntrack_zones_common.h (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d788ce62d826..36a652531791 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -368,6 +368,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include + extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h new file mode 100644 index 000000000000..5d7cf36d4766 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_zones_common.h @@ -0,0 +1,23 @@ +#ifndef _NF_CONNTRACK_ZONES_COMMON_H +#define _NF_CONNTRACK_ZONES_COMMON_H + +#include + +#define NF_CT_DEFAULT_ZONE_ID 0 + +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + +#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) + +#define NF_CT_FLAG_MARK 1 + +struct nf_conntrack_zone { + u16 id; + u8 flags; + u8 dir; +}; + +extern const struct nf_conntrack_zone nf_ct_zone_dflt; + +#endif /* _NF_CONNTRACK_ZONES_COMMON_H */ diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 5316c7b3a374..4e32512cef32 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,24 +1,7 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H -#include - -#define NF_CT_DEFAULT_ZONE_ID 0 - -#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) - -#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) - -#define NF_CT_FLAG_MARK 1 - -struct nf_conntrack_zone { - u16 id; - u8 flags; - u8 dir; -}; - -extern const struct nf_conntrack_zone nf_ct_zone_dflt; +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0b939b7ad724..8e47f8113495 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -388,6 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy); struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfq_ct_hook); +/* Built-in default zone used e.g. by modules. */ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ac3be9b0629b..eedf0495f11f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1286,13 +1286,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -/* Built-in default zone used e.g. by modules. */ -const struct nf_conntrack_zone nf_ct_zone_dflt = { - .id = NF_CT_DEFAULT_ZONE_ID, - .dir = NF_CT_DEFAULT_ZONE_DIR, -}; -EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); - #ifdef CONFIG_NF_CONNTRACK_ZONES static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { .len = sizeof(struct nf_conntrack_zone), -- cgit v1.2.3