From 8b58a39846568dcd7d0c98b2fadc25018e59dedf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 8 Jul 2015 23:32:12 +0200
Subject: ipv6: use flag instead of u16 for hop in inet6_skb_parm

Hop was always either 0 or sizeof(struct ipv6hdr).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 82806c60aa42..1319a6bb6b82 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -94,7 +94,6 @@ static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb)
 struct inet6_skb_parm {
 	int			iif;
 	__be16			ra;
-	__u16			hop;
 	__u16			dst0;
 	__u16			srcrt;
 	__u16			dst1;
@@ -111,6 +110,7 @@ struct inet6_skb_parm {
 #define IP6SKB_REROUTED		4
 #define IP6SKB_ROUTERALERT	8
 #define IP6SKB_FRAGMENTED      16
+#define IP6SKB_HOPBYHOP        32
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
-- 
cgit v1.2.3


From 634ec36cc0ab9d8dda0f2c101fa28d2e2a61b9eb Mon Sep 17 00:00:00 2001
From: David Thomson <david.thomson@alliedtelesis.co.nz>
Date: Fri, 10 Jul 2015 13:56:54 +1200
Subject: net: phy: Pass mdix ethtool setting through to phy driver

Pass the mdix setting from ethtool down to the phy driver, to allow
driver specific implementations of manually setting the polarity.

Signed-off-by: David Thomson <david.thomson@alliedtelesis.co.nz>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 ++
 include/linux/phy.h   | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index b2197b506acb..47693a9ebd3a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -353,6 +353,8 @@ int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd)
 
 	phydev->duplex = cmd->duplex;
 
+	phydev->mdix = cmd->eth_tp_mdix_ctrl;
+
 	/* Restart the PHY */
 	phy_start_aneg(phydev);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index a26c3f84b8dd..e5fb1d415961 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -424,6 +424,8 @@ struct phy_device {
 
 	struct net_device *attached_dev;
 
+	u8 mdix;
+
 	void (*adjust_link)(struct net_device *dev);
 };
 #define to_phy_device(d) container_of(d, struct phy_device, dev)
-- 
cgit v1.2.3


From 70aa996601335ca3069190ebcdae8870828086a8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jul 2015 18:13:20 -0500
Subject: netfilter: kill nf_hooks_active

The function obscures what is going on in nf_hook_thresh and it's existence
requires computing the hook list twice.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 00050dfd9f23..60e89348a91d 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -150,11 +150,6 @@ static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
 }
 #endif
 
-static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook)
-{
-	return nf_hook_list_active(&nf_hooks[pf][hook], pf, hook);
-}
-
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
 
 /**
@@ -172,10 +167,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	if (nf_hooks_active(pf, hook)) {
+	struct list_head *nf_hook_list = &nf_hooks[pf][hook];
+
+	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, &nf_hooks[pf][hook], hook, thresh,
+		nf_hook_state_init(&state, nf_hook_list, hook, thresh,
 				   pf, indev, outdev, sk, okfn);
 		return nf_hook_slow(skb, &state);
 	}
-- 
cgit v1.2.3


From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jul 2015 18:15:06 -0500
Subject: netfilter: Per network namespace netfilter hooks.

- Add a new set of functions for registering and unregistering per
  network namespace hooks.

- Modify the old global namespace hook functions to use the per
  network namespace hooks in their implementation, so their remains a
  single list that needs to be walked for any hook (this is important
  for keeping the hook priority working and for keeping the code
  walking the hooks simple).

- Only allow registering the per netdevice hooks in the network
  namespace where the network device lives.

- Dynamically allocate the structures in the per network namespace
  hook list in nf_register_net_hook, and unregister them in
  nf_unregister_net_hook.

  Dynamic allocate is required somewhere as the number of network
  namespaces are not fixed so we might as well allocate them in the
  registration function.

  The chain of registered hooks on any list is expected to be small so
  the cost of walking that list to find the entry we are unregistering
  should also be small.

  Performing the management of the dynamically allocated list entries
  in the registration and unregistration functions keeps the complexity
  from spreading.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netfilter.h     |  14 +++-
 include/net/netns/netfilter.h |   1 +
 net/netfilter/core.c          | 182 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 173 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 60e89348a91d..9bbd110ec81b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -11,6 +11,8 @@
 #include <linux/list.h>
 #include <linux/static_key.h>
 #include <linux/netfilter_defs.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
 
 #ifdef CONFIG_NETFILTER
 static inline int NF_DROP_GETERR(int verdict)
@@ -118,6 +120,13 @@ struct nf_sockopt_ops {
 };
 
 /* Function to register/unregister hook points. */
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops);
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops);
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n);
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n);
+
 int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
@@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 int nf_register_sockopt(struct nf_sockopt_ops *reg);
 void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
-extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
-
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
@@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int (*okfn)(struct sock *, struct sk_buff *),
 				 int thresh)
 {
-	struct list_head *nf_hook_list = &nf_hooks[pf][hook];
+	struct net *net = dev_net(indev ? indev : outdev);
+	struct list_head *nf_hook_list = &net->nf.hooks[pf][hook];
 
 	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
 		struct nf_hook_state state;
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 532e4ba64f49..38aa4983e2a9 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -14,5 +14,6 @@ struct netns_nf {
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header *nf_log_dir_header;
 #endif
+	struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 };
 #endif
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index fa4d3c111d3f..56ead1a1711c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo)
 }
 EXPORT_SYMBOL_GPL(nf_unregister_afinfo);
 
-struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly;
-EXPORT_SYMBOL(nf_hooks);
-
 #ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
@@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed);
 
 static DEFINE_MUTEX(nf_hook_mutex);
 
-static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg)
+static struct list_head *find_nf_hook_list(struct net *net,
+					   const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		nf_hook_list = &nf_hooks[reg->pf][reg->hooknum];
+		nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
-		if (reg->dev)
+		if (reg->dev && dev_net(reg->dev) == net)
 			nf_hook_list = &reg->dev->nf_hooks_ingress;
 #endif
 	}
 	return nf_hook_list;
 }
 
-int nf_register_hook(struct nf_hook_ops *reg)
+int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
 	struct list_head *nf_hook_list;
-	struct nf_hook_ops *elem;
+	struct nf_hook_ops *elem, *new;
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
 
-	nf_hook_list = find_nf_hook_list(reg);
+	new->hook     = reg->hook;
+	new->dev      = reg->dev;
+	new->owner    = reg->owner;
+	new->priv     = reg->priv;
+	new->pf       = reg->pf;
+	new->hooknum  = reg->hooknum;
+	new->priority = reg->priority;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
 	if (!nf_hook_list)
 		return -ENOENT;
 
@@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg)
 		if (reg->priority < elem->priority)
 			break;
 	}
-	list_add_rcu(&reg->list, elem->list.prev);
+	list_add_rcu(&new->list, elem->list.prev);
 	mutex_unlock(&nf_hook_mutex);
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
@@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg)
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(nf_register_hook);
+EXPORT_SYMBOL(nf_register_net_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
+	struct list_head *nf_hook_list;
+	struct nf_hook_ops *elem;
+
+	nf_hook_list = find_nf_hook_list(net, reg);
+	if (!nf_hook_list)
+		return;
+
 	mutex_lock(&nf_hook_mutex);
-	list_del_rcu(&reg->list);
+	list_for_each_entry(elem, nf_hook_list, list) {
+		if ((reg->hook     == elem->hook) &&
+		    (reg->dev      == elem->dev) &&
+		    (reg->owner    == elem->owner) &&
+		    (reg->priv     == elem->priv) &&
+		    (reg->pf       == elem->pf) &&
+		    (reg->hooknum  == elem->hooknum) &&
+		    (reg->priority == elem->priority)) {
+			list_del_rcu(&elem->list);
+			break;
+		}
+	}
 	mutex_unlock(&nf_hook_mutex);
+	if (&elem->list == nf_hook_list) {
+		WARN(1, "nf_unregister_net_hook: hook not found!\n");
+		return;
+	}
 #ifdef CONFIG_NETFILTER_INGRESS
 	if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_dec_ingress_queue();
@@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg)
 	static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]);
 #endif
 	synchronize_net();
-	nf_queue_nf_hook_drop(reg);
+	nf_queue_nf_hook_drop(elem);
+	kfree(elem);
+}
+EXPORT_SYMBOL(nf_unregister_net_hook);
+
+int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			  unsigned int n)
+{
+	unsigned int i;
+	int err = 0;
+
+	for (i = 0; i < n; i++) {
+		err = nf_register_net_hook(net, &reg[i]);
+		if (err)
+			goto err;
+	}
+	return err;
+
+err:
+	if (i > 0)
+		nf_unregister_net_hooks(net, reg, i);
+	return err;
+}
+EXPORT_SYMBOL(nf_register_net_hooks);
+
+void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg,
+			     unsigned int n)
+{
+	while (n-- > 0)
+		nf_unregister_net_hook(net, &reg[n]);
+}
+EXPORT_SYMBOL(nf_unregister_net_hooks);
+
+static LIST_HEAD(nf_hook_list);
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+	struct net *net, *last;
+	int ret;
+
+	rtnl_lock();
+	for_each_net(net) {
+		ret = nf_register_net_hook(net, reg);
+		if (ret && ret != -ENOENT)
+			goto rollback;
+	}
+	list_add_tail(&reg->list, &nf_hook_list);
+	rtnl_unlock();
+
+	return 0;
+rollback:
+	last = net;
+	for_each_net(net) {
+		if (net == last)
+			break;
+		nf_unregister_net_hook(net, reg);
+	}
+	rtnl_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_register_hook);
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+	struct net *net;
+
+	rtnl_lock();
+	list_del(&reg->list);
+	for_each_net(net)
+		nf_unregister_net_hook(net, reg);
+	rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
 
@@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
 EXPORT_SYMBOL(nf_nat_decode_session_hook);
 #endif
 
+static int nf_register_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+	int ret;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list) {
+		ret = nf_register_net_hook(net, elem);
+		if (ret && ret != -ENOENT)
+			goto out_undo;
+	}
+	rtnl_unlock();
+	return 0;
+
+out_undo:
+	list_for_each_entry_continue_reverse(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+	return ret;
+}
+
+static void nf_unregister_hook_list(struct net *net)
+{
+	struct nf_hook_ops *elem;
+
+	rtnl_lock();
+	list_for_each_entry(elem, &nf_hook_list, list)
+		nf_unregister_net_hook(net, elem);
+	rtnl_unlock();
+}
+
 static int __net_init netfilter_net_init(struct net *net)
 {
+	int i, h, ret;
+
+	for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) {
+		for (h = 0; h < NF_MAX_HOOKS; h++)
+			INIT_LIST_HEAD(&net->nf.hooks[i][h]);
+	}
+
 #ifdef CONFIG_PROC_FS
 	net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
 						net->proc_net);
@@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net)
 		return -ENOMEM;
 	}
 #endif
-	return 0;
+	ret = nf_register_hook_list(net);
+	if (ret)
+		remove_proc_entry("netfilter", net->proc_net);
+
+	return ret;
 }
 
 static void __net_exit netfilter_net_exit(struct net *net)
 {
+	nf_unregister_hook_list(net);
 	remove_proc_entry("netfilter", net->proc_net);
 }
 
@@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = {
 
 int __init netfilter_init(void)
 {
-	int i, h, ret;
-
-	for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) {
-		for (h = 0; h < NF_MAX_HOOKS; h++)
-			INIT_LIST_HEAD(&nf_hooks[i][h]);
-	}
+	int ret;
 
 	ret = register_pernet_subsys(&netfilter_net_ops);
 	if (ret < 0)
-- 
cgit v1.2.3


From e7c8899f3e6f2830136cf6e115c4a55ce7a3920a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:07 +0200
Subject: netfilter: move tee_active to core

This prepares for a TEE like expression in nftables.
We want to ensure only one duplicate is sent, so both will
use the same percpu variable to detect duplication.

The other use case is detection of recursive call to xtables, but since
we don't want dependency from nft to xtables core its put into core.c
instead of the x_tables core.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 11 +++++++++++
 net/netfilter/core.c      |  3 +++
 net/netfilter/xt_TEE.c    | 13 ++++++-------
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9bbd110ec81b..e01da73ee6c4 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -390,4 +390,15 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook;
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
 #endif
 
+/**
+ * nf_skb_duplicated - TEE target has sent a packet
+ *
+ * When a xtables target sends a packet, the OUTPUT and POSTROUTING
+ * hooks are traversed again, i.e. nft and xtables are invoked recursively.
+ *
+ * This is used by xtables TEE target to prevent the duplicated skb from
+ * being duplicated again.
+ */
+DECLARE_PER_CPU(bool, nf_skb_duplicated);
+
 #endif /*__LINUX_NETFILTER_H*/
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 56ead1a1711c..6896cee8b733 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo);
 const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ipv6_ops);
 
+DEFINE_PER_CPU(bool, nf_skb_duplicated);
+EXPORT_SYMBOL_GPL(nf_skb_duplicated);
+
 int nf_register_afinfo(const struct nf_afinfo *afinfo)
 {
 	mutex_lock(&afinfo_mutex);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index a747eb475b68..8950e79c4dc9 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -37,7 +37,6 @@ struct xt_tee_priv {
 };
 
 static const union nf_inet_addr tee_zero_address;
-static DEFINE_PER_CPU(bool, tee_active);
 
 static struct net *pick_net(struct sk_buff *skb)
 {
@@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tee_tginfo *info = par->targinfo;
 	struct iphdr *iph;
 
-	if (__this_cpu_read(tee_active))
+	if (__this_cpu_read(nf_skb_duplicated))
 		return XT_CONTINUE;
 	/*
 	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
@@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	ip_send_check(iph);
 
 	if (tee_tg_route4(skb, info)) {
-		__this_cpu_write(tee_active, true);
+		__this_cpu_write(nf_skb_duplicated, true);
 		ip_local_out(skb);
-		__this_cpu_write(tee_active, false);
+		__this_cpu_write(nf_skb_duplicated, false);
 	} else {
 		kfree_skb(skb);
 	}
@@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tee_tginfo *info = par->targinfo;
 
-	if (__this_cpu_read(tee_active))
+	if (__this_cpu_read(nf_skb_duplicated))
 		return XT_CONTINUE;
 	skb = pskb_copy(skb, GFP_ATOMIC);
 	if (skb == NULL)
@@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 		--iph->hop_limit;
 	}
 	if (tee_tg_route6(skb, info)) {
-		__this_cpu_write(tee_active, true);
+		__this_cpu_write(nf_skb_duplicated, true);
 		ip6_local_out(skb);
-		__this_cpu_write(tee_active, false);
+		__this_cpu_write(nf_skb_duplicated, false);
 	} else {
 		kfree_skb(skb);
 	}
-- 
cgit v1.2.3


From 7814b6ec6d0d63444abdb49554166c8cfcbd063e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:08 +0200
Subject: netfilter: xtables: don't save/restore jumpstack offset

In most cases there is no reentrancy into ip/ip6tables.

For skbs sent by REJECT or SYNPROXY targets, there is one level
of reentrancy, but its not relevant as those targets issue an absolute
verdict, i.e. the jumpstack can be clobbered since its not used
after the target issues absolute verdict (ACCEPT, DROP, STOLEN, etc).

So the only special case where it is relevant is the TEE target, which
returns XT_CONTINUE.

This patch changes ip(6)_do_table to always use the jump stack starting
from 0.

When we detect we're operating on an skb sent via TEE (percpu
nf_skb_duplicated is 1) we switch to an alternate stack to leave
the original one alone.

Since there is no TEE support for arptables, it doesn't need to
test if tee is active.

The jump stack overflow tests are no longer needed as well --
since ->stacksize is the largest call depth we cannot exceed it.

A much better alternative to the external jumpstack would be to just
declare a jumps[32] stack on the local stack frame, but that would mean
we'd have to reject iptables rulesets that used to work before.

Another alternative would be to start rejecting rulesets with a larger
call depth, e.g. 1000 -- in this case it would be feasible to allocate the
entire stack in the percpu area which would avoid one dereference.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  1 -
 net/ipv4/netfilter/arp_tables.c    | 11 +++--------
 net/ipv4/netfilter/ip_tables.c     | 37 ++++++++++++++++++++-----------------
 net/ipv6/netfilter/ip6_tables.c    | 26 ++++++++++++++------------
 net/netfilter/x_tables.c           | 22 +++++++++++-----------
 5 files changed, 48 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 286098a5667f..149284557ca7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -222,7 +222,6 @@ struct xt_table_info {
 	 * @stacksize jumps (number of user chains) can possibly be made.
 	 */
 	unsigned int stacksize;
-	unsigned int __percpu *stackptr;
 	void ***jumpstack;
 
 	unsigned char entries[0] __aligned(8);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ae6d0a124213..969fdbe6fbb5 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	table_base = private->entries;
 	jumpstack  = (struct arpt_entry **)private->jumpstack[cpu];
 
+	/* No TEE support for arptables, so no need to switch to alternate
+	 * stack.  All targets that reenter must return absolute verdicts.
+	 */
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	acpar.in      = state->in;
@@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			}
 			if (table_base + v
 			    != arpt_next_entry(e)) {
-
-				if (stackidx >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
 				jumpstack[stackidx++] = e;
 			}
 
@@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			continue;
 		}
 
-		/* Targets which reenter must return
-		 * abs. verdicts
-		 */
 		acpar.target   = t->u.kernel.target;
 		acpar.targinfo = t->data;
 		verdict = t->u.kernel.target->target(skb, &acpar);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 5e44b35a8de8..a2e4b018a254 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb,
 	const char *indev, *outdev;
 	const void *table_base;
 	struct ipt_entry *e, **jumpstack;
-	unsigned int *stackptr, origptr, cpu;
+	unsigned int stackidx, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
 	unsigned int addend;
 
 	/* Initialization */
+	stackidx = 0;
 	ip = ip_hdr(skb);
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
@@ -331,13 +332,20 @@ ipt_do_table(struct sk_buff *skb,
 	smp_read_barrier_depends();
 	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
-	stackptr   = per_cpu_ptr(private->stackptr, cpu);
-	origptr    = *stackptr;
+
+	/* Switch to alternate jumpstack if we're being invoked via TEE.
+	 * TEE issues XT_CONTINUE verdict on original skb so we must not
+	 * clobber the jumpstack.
+	 *
+	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
+	 * but it is no problem since absolute verdict is issued by these.
+	 */
+	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
-		 table->name, hook, origptr,
+	pr_debug("Entering %s(hook %u), UF %p\n",
+		 table->name, hook,
 		 get_entry(table_base, private->underflow[hook]));
 
 	do {
@@ -383,28 +391,24 @@ ipt_do_table(struct sk_buff *skb,
 					verdict = (unsigned int)(-v) - 1;
 					break;
 				}
-				if (*stackptr <= origptr) {
+				if (stackidx == 0) {
 					e = get_entry(table_base,
 					    private->underflow[hook]);
 					pr_debug("Underflow (this is normal) "
 						 "to %p\n", e);
 				} else {
-					e = jumpstack[--*stackptr];
+					e = jumpstack[--stackidx];
 					pr_debug("Pulled %p out from pos %u\n",
-						 e, *stackptr);
+						 e, stackidx);
 					e = ipt_next_entry(e);
 				}
 				continue;
 			}
 			if (table_base + v != ipt_next_entry(e) &&
 			    !(e->ip.flags & IPT_F_GOTO)) {
-				if (*stackptr >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
-				jumpstack[(*stackptr)++] = e;
+				jumpstack[stackidx++] = e;
 				pr_debug("Pushed %p into pos %u\n",
-					 e, *stackptr - 1);
+					 e, stackidx - 1);
 			}
 
 			e = get_entry(table_base, v);
@@ -423,9 +427,8 @@ ipt_do_table(struct sk_buff *skb,
 			/* Verdict */
 			break;
 	} while (!acpar.hotdrop);
-	pr_debug("Exiting %s; resetting sp from %u to %u\n",
-		 __func__, *stackptr, origptr);
-	*stackptr = origptr;
+	pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
+
  	xt_write_recseq_end(addend);
  	local_bh_enable();
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index baf032179918..531281f0ff86 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb,
 	const char *indev, *outdev;
 	const void *table_base;
 	struct ip6t_entry *e, **jumpstack;
-	unsigned int *stackptr, origptr, cpu;
+	unsigned int stackidx, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
 	unsigned int addend;
 
 	/* Initialization */
+	stackidx = 0;
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
 	/* We handle fragments by dealing with the first fragment as
@@ -357,8 +358,15 @@ ip6t_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries;
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
-	stackptr   = per_cpu_ptr(private->stackptr, cpu);
-	origptr    = *stackptr;
+
+	/* Switch to alternate jumpstack if we're being invoked via TEE.
+	 * TEE issues XT_CONTINUE verdict on original skb so we must not
+	 * clobber the jumpstack.
+	 *
+	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
+	 * but it is no problem since absolute verdict is issued by these.
+	 */
+	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
@@ -406,20 +414,16 @@ ip6t_do_table(struct sk_buff *skb,
 					verdict = (unsigned int)(-v) - 1;
 					break;
 				}
-				if (*stackptr <= origptr)
+				if (stackidx == 0)
 					e = get_entry(table_base,
 					    private->underflow[hook]);
 				else
-					e = ip6t_next_entry(jumpstack[--*stackptr]);
+					e = ip6t_next_entry(jumpstack[--stackidx]);
 				continue;
 			}
 			if (table_base + v != ip6t_next_entry(e) &&
 			    !(e->ipv6.flags & IP6T_F_GOTO)) {
-				if (*stackptr >= private->stacksize) {
-					verdict = NF_DROP;
-					break;
-				}
-				jumpstack[(*stackptr)++] = e;
+				jumpstack[stackidx++] = e;
 			}
 
 			e = get_entry(table_base, v);
@@ -437,8 +441,6 @@ ip6t_do_table(struct sk_buff *skb,
 			break;
 	} while (!acpar.hotdrop);
 
-	*stackptr = origptr;
-
  	xt_write_recseq_end(addend);
  	local_bh_enable();
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 4db7d60d42fa..154447e519ab 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 	[NFPROTO_IPV6]   = "ip6",
 };
 
-/* Allow this many total (re)entries. */
-static const unsigned int xt_jumpstack_multiplier = 2;
-
 /* Registration hooks for targets. */
 int xt_register_target(struct xt_target *target)
 {
@@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info)
 		kvfree(info->jumpstack);
 	}
 
-	free_percpu(info->stackptr);
-
 	kvfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
@@ -737,10 +732,6 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	unsigned int size;
 	int cpu;
 
-	i->stackptr = alloc_percpu(unsigned int);
-	if (i->stackptr == NULL)
-		return -ENOMEM;
-
 	size = sizeof(void **) * nr_cpu_ids;
 	if (size > PAGE_SIZE)
 		i->jumpstack = vzalloc(size);
@@ -753,8 +744,17 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	if (i->stacksize == 0)
 		return 0;
 
-	i->stacksize *= xt_jumpstack_multiplier;
-	size = sizeof(void *) * i->stacksize;
+	/* Jumpstack needs to be able to record two full callchains, one
+	 * from the first rule set traversal, plus one table reentrancy
+	 * via -j TEE without clobbering the callchain that brought us to
+	 * TEE target.
+	 *
+	 * This is done by allocating two jumpstacks per cpu, on reentry
+	 * the upper half of the stack is used.
+	 *
+	 * see the jumpstack setup in ipt_do_table() for more details.
+	 */
+	size = sizeof(void *) * i->stacksize * 2u;
 	for_each_possible_cpu(cpu) {
 		if (size > PAGE_SIZE)
 			i->jumpstack[cpu] = vmalloc_node(size,
-- 
cgit v1.2.3


From dcebd3153e0a7749bb054ab73fa4e1ca33e9d3f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 14 Jul 2015 17:51:09 +0200
Subject: netfilter: add and use jump label for xt_tee

Don't bother testing if we need to switch to alternate stack
unless TEE target is used.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 7 +++++++
 net/ipv4/netfilter/ip_tables.c     | 3 ++-
 net/ipv6/netfilter/ip6_tables.c    | 3 ++-
 net/netfilter/x_tables.c           | 3 +++
 net/netfilter/xt_TEE.c             | 2 ++
 5 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 149284557ca7..b006b719183f 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -3,6 +3,7 @@
 
 
 #include <linux/netdevice.h>
+#include <linux/static_key.h>
 #include <uapi/linux/netfilter/x_tables.h>
 
 /**
@@ -280,6 +281,12 @@ void xt_free_table_info(struct xt_table_info *info);
  */
 DECLARE_PER_CPU(seqcount_t, xt_recseq);
 
+/* xt_tee_enabled - true if x_tables needs to handle reentrancy
+ *
+ * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
+ */
+extern struct static_key xt_tee_enabled;
+
 /**
  * xt_write_recseq_begin - start of a write section
  *
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a2e4b018a254..ff585bdbf850 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -340,7 +340,8 @@ ipt_do_table(struct sk_buff *skb,
 	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
 	 * but it is no problem since absolute verdict is issued by these.
 	 */
-	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+	if (static_key_false(&xt_tee_enabled))
+		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 531281f0ff86..ea6d105063c2 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -366,7 +366,8 @@ ip6t_do_table(struct sk_buff *skb,
 	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
 	 * but it is no problem since absolute verdict is issued by these.
 	 */
-	jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+	if (static_key_false(&xt_tee_enabled))
+		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 154447e519ab..9b42b5ea6dcd 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -727,6 +727,9 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
 DEFINE_PER_CPU(seqcount_t, xt_recseq);
 EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
 
+struct static_key xt_tee_enabled __read_mostly;
+EXPORT_SYMBOL_GPL(xt_tee_enabled);
+
 static int xt_jumpstack_alloc(struct xt_table_info *i)
 {
 	unsigned int size;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 8950e79c4dc9..c5d6556dbc5e 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -251,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 	} else
 		info->priv = NULL;
 
+	static_key_slow_inc(&xt_tee_enabled);
 	return 0;
 }
 
@@ -262,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
 		unregister_netdevice_notifier(&info->priv->notifier);
 		kfree(info->priv);
 	}
+	static_key_slow_dec(&xt_tee_enabled);
 }
 
 static struct xt_target tee_tg_reg[] __read_mostly = {
-- 
cgit v1.2.3


From d746d707a8b1421a4ba46b497cb5d59e20161645 Mon Sep 17 00:00:00 2001
From: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Date: Tue, 14 Jul 2015 13:43:19 -0700
Subject: net core: Add protodown support.

This patch introduces the proto_down flag that can be used by user space
applications to notify switch drivers that errors have been detected on the
device.

The switch driver can react to protodown notification by doing a phys down
on the associated switch port.

Signed-off-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Wilson Kok <wkok@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 14 ++++++++++++++
 net/core/dev.c            | 20 ++++++++++++++++++++
 net/core/net-sysfs.c      | 14 ++++++++++++++
 3 files changed, 48 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e20979dfd6a9..45cfd797eb77 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1041,6 +1041,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	TX queue.
  * int (*ndo_get_iflink)(const struct net_device *dev);
  *	Called to get the iflink value of this device.
+ * void (*ndo_change_proto_down)(struct net_device *dev,
+ *				  bool proto_down);
+ *	This function is used to pass protocol port error state information
+ *	to the switch driver. The switch driver can react to the proto_down
+ *      by doing a phys down on the associated switch port.
+ *
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1211,6 +1217,8 @@ struct net_device_ops {
 						      int queue_index,
 						      u32 maxrate);
 	int			(*ndo_get_iflink)(const struct net_device *dev);
+	int			(*ndo_change_proto_down)(struct net_device *dev,
+							 bool proto_down);
 };
 
 /**
@@ -1502,6 +1510,10 @@ enum netdev_priv_flags {
  *
  *	@qdisc_tx_busylock:	XXX: need comments on this one
  *
+ *	@proto_down:	protocol port state information can be sent to the
+ *			switch driver and used to set the phys state of the
+ *			switch port.
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -1762,6 +1774,7 @@ struct net_device {
 #endif
 	struct phy_device *phydev;
 	struct lock_class_key *qdisc_tx_busylock;
+	bool proto_down;
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
@@ -2982,6 +2995,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 			 struct netdev_phys_item_id *ppid);
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
+int dev_change_proto_down(struct net_device *dev, bool proto_down);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index 69445a33ace6..8810b6bbebfe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6074,6 +6074,26 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_get_phys_port_name);
 
+/**
+ *	dev_change_proto_down - update protocol port state information
+ *	@dev: device
+ *	@proto_down: new value
+ *
+ *	This info can be used by switch drivers to set the phys state of the
+ *	port.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!ops->ndo_change_proto_down)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+	return ops->ndo_change_proto_down(dev, proto_down);
+}
+EXPORT_SYMBOL(dev_change_proto_down);
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 18b34d771ed4..194c1d03b2b3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
 NETDEVICE_SHOW(group, fmt_dec);
 static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
 
+static int change_proto_down(struct net_device *dev, unsigned long proto_down)
+{
+	return dev_change_proto_down(dev, (bool) proto_down);
+}
+
+static ssize_t proto_down_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_proto_down);
+}
+NETDEVICE_SHOW_RW(proto_down, fmt_dec);
+
 static ssize_t phys_port_id_show(struct device *dev,
 				 struct device_attribute *attr, char *buf)
 {
@@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = {
 	&dev_attr_phys_port_id.attr,
 	&dev_attr_phys_port_name.attr,
 	&dev_attr_phys_switch_id.attr,
+	&dev_attr_proto_down.attr,
 	NULL,
 };
 ATTRIBUTE_GROUPS(net_class);
-- 
cgit v1.2.3


From 0c4f691ff6791e55ac831666df0b49b1679c56e4 Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sat, 18 Jul 2015 18:24:48 -0700
Subject: net: don't reforward packets already forwarded by offload device

Just before queuing skb for xmit on port, check if skb has been marked by
switchdev port driver as already fordwarded by device.  If so, drop skb.  A
non-zero skb->offload_fwd_mark field is set by the switchdev port
driver/device on ingress to indicate the skb has already been forwarded by
the device to egress ports with matching dev->skb_mark.  The switchdev port
driver would assign a non-zero dev->offload_skb_mark for each device port
netdev during registration, for example.

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ++++++
 include/linux/skbuff.h    |  9 ++++++++-
 net/core/dev.c            | 10 ++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 45cfd797eb77..8364f29e08be 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,6 +1456,8 @@ enum netdev_priv_flags {
  *
  *	@xps_maps:	XXX: need comments on this one
  *
+ *	@offload_fwd_mark:	Offload device fwding mark
+ *
  *	@trans_start:		Time (in jiffies) of last Tx
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog ( see dev_watchdog() )
@@ -1697,6 +1699,10 @@ struct net_device {
 	struct xps_dev_maps __rcu *xps_maps;
 #endif
 
+#ifdef CONFIG_NET_SWITCHDEV
+	u32			offload_fwd_mark;
+#endif
+
 	/* These may be needed for future network-power-down code. */
 
 	/*
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..af7a09650fa2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -506,6 +506,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
+ *	@offload_fwd_mark: fwding offload mark
  *	@mark: Generic packet mark
  *	@vlan_proto: vlan encapsulation protocol
  *	@vlan_tci: vlan tag control information
@@ -650,9 +651,15 @@ struct sk_buff {
 		unsigned int	sender_cpu;
 	};
 #endif
+	union {
 #ifdef CONFIG_NETWORK_SECMARK
-	__u32			secmark;
+		__u32		secmark;
+#endif
+#ifdef CONFIG_NET_SWITCHDEV
+		__u32		offload_fwd_mark;
 #endif
+	};
+
 	union {
 		__u32		mark;
 		__u32		reserved_tailroom;
diff --git a/net/core/dev.c b/net/core/dev.c
index 8810b6bbebfe..2ee15afb412d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	else
 		skb_dst_force(skb);
 
+#ifdef CONFIG_NET_SWITCHDEV
+	/* Don't forward if offload device already forwarded */
+	if (skb->offload_fwd_mark &&
+	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
+		consume_skb(skb);
+		rc = NET_XMIT_SUCCESS;
+		goto out;
+	}
+#endif
+
 	txq = netdev_pick_tx(dev, skb, accel_priv);
 	q = rcu_dereference_bh(txq->qdisc);
 
-- 
cgit v1.2.3


From d754f98b502ad9a8c7570d494e1eaa0e6bc0350c Mon Sep 17 00:00:00 2001
From: Scott Feldman <sfeldma@gmail.com>
Date: Sat, 18 Jul 2015 18:24:49 -0700
Subject: net: add phys ID compare helper to test if two IDs are the same

Signed-off-by: Scott Feldman <sfeldma@gmail.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 +++++++
 net/switchdev/switchdev.c | 8 ++------
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8364f29e08be..607b5f41f46f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -766,6 +766,13 @@ struct netdev_phys_item_id {
 	unsigned char id_len;
 };
 
+static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
+					    struct netdev_phys_item_id *b)
+{
+	return a->id_len == b->id_len &&
+	       memcmp(a->id, b->id, a->id_len) == 0;
+}
+
 typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 9f2add3cba26..4e5bba50ccff 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
 		if (switchdev_port_attr_get(dev, &attr))
 			return NULL;
 
-		if (nhsel > 0) {
-			if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len)
+		if (nhsel > 0 &&
+		    !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
 				return NULL;
-			if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id,
-				   attr.u.ppid.id_len))
-				return NULL;
-		}
 
 		prev_attr = attr;
 	}
-- 
cgit v1.2.3


From 6acc23266054a9969737b435fa012f87465dbc50 Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 16 Jul 2015 21:50:50 +0200
Subject: net: remove skb_frag_add_head

It's not used anywhere.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index af7a09650fa2..6bd96fe9416a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2678,12 +2678,6 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 	skb_shinfo(skb)->frag_list = NULL;
 }
 
-static inline void skb_frag_add_head(struct sk_buff *skb, struct sk_buff *frag)
-{
-	frag->next = skb_shinfo(skb)->frag_list;
-	skb_shinfo(skb)->frag_list = frag;
-}
-
 #define skb_walk_frags(skb, iter)	\
 	for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
 
-- 
cgit v1.2.3


From f4c190eb8b4f80b12dc98ce7d54a3bea0e4e7e69 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Fri, 17 Jul 2015 00:26:12 +0200
Subject: stmmac: drop custom_* fields from plat_stmmacenet_data

Both of these fields are unused and has been unused since they
were added 3 and 5 years ago. Drop them since they are clearly
not very useful.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt | 4 ----
 include/linux/stmmac.h              | 2 --
 2 files changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index e655e2453c98..5fddefa69baf 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -139,8 +139,6 @@ struct plat_stmmacenet_data {
 	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void *custom_cfg;
-	void *custom_data;
 	void *bsp_priv;
 };
 
@@ -186,8 +184,6 @@ Where:
 	     which will be stored in bsp_priv, and then passed to init and
 	     exit callbacks. init/exit callbacks should not use or modify
 	     platform data.
- o custom_cfg/custom_data: this is a custom configuration that can be passed
-			   while initializing the resources.
  o bsp_priv: another private pointer.
 
 For MDIO bus The we have:
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c735f5c91eea..c86a20047cb1 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -123,8 +123,6 @@ struct plat_stmmacenet_data {
 	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void *custom_cfg;
-	void *custom_data;
 	void *bsp_priv;
 };
 
-- 
cgit v1.2.3


From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 20 Jul 2015 20:34:18 -0700
Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers

Allow eBPF programs attached to TC qdiscs call skb_vlan_push/pop via
helper functions. These functions may change skb->data/hlen which are
cached by some JITs to improve performance of ld_abs/ld_ind instructions.
Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls,
re-compute header len and re-cache skb->data/hlen back into cpu registers.
Note, skb->data/hlen are not directly accessible from the programs,
so any changes to skb->data done either by these helpers or by other
TC actions are safe.

eBPF JIT supported by three architectures:
- arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is.
- x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls
  (experiments showed that conditional re-caching is slower).
- s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present
  in the program (re-caching is tbd).

These helpers allow more scalable handling of vlan from the programs.
Instead of creating thousands of vlan netdevs on top of eth0 and attaching
TC+ingress+bpf to all of them, the program can be attached to eth0 directly
and manipulate vlans as necessary.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/net/bpf_jit_comp.c |  4 +++
 arch/x86/net/bpf_jit_comp.c  | 80 +++++++++++++++++++++++---------------------
 include/linux/bpf.h          |  2 ++
 include/linux/filter.h       |  1 +
 include/uapi/linux/bpf.h     |  2 ++
 net/core/filter.c            | 48 ++++++++++++++++++++++++++
 6 files changed, 99 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index fee782acc2ee..79c731e8d178 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -973,6 +973,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		 */
 		const u64 func = (u64)__bpf_call_base + imm;
 
+		if (bpf_helper_changes_skb_data((void *)func))
+			/* TODO reload skb->data, hlen */
+			return -1;
+
 		REG_SET_SEEN(BPF_REG_5);
 		jit->seen |= SEEN_FUNC;
 		/* lg %w1,<d(imm)>(%l) */
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 579a8fd74be0..6c335a8fc086 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog)
 	*pprog = prog;
 }
 
+
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	/* r9d = skb->len - skb->data_len (headlen)
+	 * r10 = skb->data
+	 */
+	/* mov %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
+
+	/* sub %r9d, off32(%rdi) */
+	EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
+
+	/* mov %r10, off32(%rdi) */
+	EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
+	*pprog = prog;
+}
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		  int oldproglen, struct jit_context *ctx)
 {
@@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 
 	emit_prologue(&prog);
 
-	if (seen_ld_abs) {
-		/* r9d : skb->len - skb->data_len (headlen)
-		 * r10 : skb->data
-		 */
-		if (is_imm8(offsetof(struct sk_buff, len)))
-			/* mov %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x8b, 0x4f,
-			      offsetof(struct sk_buff, len));
-		else
-			/* mov %r9d, off32(%rdi) */
-			EMIT3_off32(0x44, 0x8b, 0x8f,
-				    offsetof(struct sk_buff, len));
-
-		if (is_imm8(offsetof(struct sk_buff, data_len)))
-			/* sub %r9d, off8(%rdi) */
-			EMIT4(0x44, 0x2b, 0x4f,
-			      offsetof(struct sk_buff, data_len));
-		else
-			EMIT3_off32(0x44, 0x2b, 0x8f,
-				    offsetof(struct sk_buff, data_len));
-
-		if (is_imm8(offsetof(struct sk_buff, data)))
-			/* mov %r10, off8(%rdi) */
-			EMIT4(0x4c, 0x8b, 0x57,
-			      offsetof(struct sk_buff, data));
-		else
-			/* mov %r10, off32(%rdi) */
-			EMIT3_off32(0x4c, 0x8b, 0x97,
-				    offsetof(struct sk_buff, data));
-	}
+	if (seen_ld_abs)
+		emit_load_skb_data_hlen(&prog);
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		const s32 imm32 = insn->imm;
@@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 		u8 b1 = 0, b2 = 0, b3 = 0;
 		s64 jmp_offset;
 		u8 jmp_cond;
+		bool reload_skb_data;
 		int ilen;
 		u8 *func;
 
@@ -818,12 +811,18 @@ xadd:			if (is_imm8(insn->off))
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x52); /* push %r10 */
-				EMIT2(0x41, 0x51); /* push %r9 */
-				/* need to adjust jmp offset, since
-				 * pop %r9, pop %r10 take 4 bytes after call insn
-				 */
-				jmp_offset += 4;
+				reload_skb_data = bpf_helper_changes_skb_data(func);
+				if (reload_skb_data) {
+					EMIT1(0x57); /* push %rdi */
+					jmp_offset += 22; /* pop, mov, sub, mov */
+				} else {
+					EMIT2(0x41, 0x52); /* push %r10 */
+					EMIT2(0x41, 0x51); /* push %r9 */
+					/* need to adjust jmp offset, since
+					 * pop %r9, pop %r10 take 4 bytes after call insn
+					 */
+					jmp_offset += 4;
+				}
 			}
 			if (!imm32 || !is_simm32(jmp_offset)) {
 				pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -832,8 +831,13 @@ xadd:			if (is_imm8(insn->off))
 			}
 			EMIT1_off32(0xE8, jmp_offset);
 			if (seen_ld_abs) {
-				EMIT2(0x41, 0x59); /* pop %r9 */
-				EMIT2(0x41, 0x5A); /* pop %r10 */
+				if (reload_skb_data) {
+					EMIT1(0x5F); /* pop %rdi */
+					emit_load_skb_data_hlen(&prog);
+				} else {
+					EMIT2(0x41, 0x59); /* pop %r9 */
+					EMIT2(0x41, 0x5A); /* pop %r10 */
+				}
 			}
 			break;
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4383476a0d48..139d6d2e123f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -192,5 +192,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
 extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
+extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
+extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 17724f6ea983..69d00555ce35 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -411,6 +411,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
+bool bpf_helper_changes_skb_data(void *func);
 
 #ifdef CONFIG_BPF_JIT
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2de87e58b12b..2f6c83d714e9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -256,6 +256,8 @@ enum bpf_func_id {
 	 * Return: classid if != 0
 	 */
 	BPF_FUNC_get_cgroup_classid,
+	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
+	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 247450a5e387..50338071fac4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1437,6 +1437,50 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	__be16 vlan_proto = (__force __be16) r2;
+
+	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
+		     vlan_proto != htons(ETH_P_8021AD)))
+		vlan_proto = htons(ETH_P_8021Q);
+
+	return skb_vlan_push(skb, vlan_proto, vlan_tci);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_push_proto = {
+	.func           = bpf_skb_vlan_push,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
+static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+
+	return skb_vlan_pop(skb);
+}
+
+const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
+	.func           = bpf_skb_vlan_pop,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
+
+bool bpf_helper_changes_skb_data(void *func)
+{
+	if (func == bpf_skb_vlan_push)
+		return true;
+	if (func == bpf_skb_vlan_pop)
+		return true;
+	return false;
+}
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -1476,6 +1520,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_clone_redirect_proto;
 	case BPF_FUNC_get_cgroup_classid:
 		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_skb_vlan_push:
+		return &bpf_skb_vlan_push_proto;
+	case BPF_FUNC_skb_vlan_pop:
+		return &bpf_skb_vlan_pop_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:46 +0200
Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls

Provides infrastructure to parse/dump/store encap information for
light weight tunnels like mpls. Encap information for such tunnels
is associated with fib routes.

This infrastructure is based on previous suggestions from
Eric Biederman to follow the xfrm infrastructure.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lwtunnel.h      |   6 ++
 include/net/lwtunnel.h        | 132 +++++++++++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  15 ++++
 net/Kconfig                   |   7 ++
 net/core/Makefile             |   1 +
 net/core/lwtunnel.c           | 179 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 340 insertions(+)
 create mode 100644 include/linux/lwtunnel.h
 create mode 100644 include/net/lwtunnel.h
 create mode 100644 include/uapi/linux/lwtunnel.h
 create mode 100644 net/core/lwtunnel.c

(limited to 'include/linux')

diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h
new file mode 100644
index 000000000000..97f32f8b4ae1
--- /dev/null
+++ b/include/linux/lwtunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_LWTUNNEL_H_
+#define _LINUX_LWTUNNEL_H_
+
+#include <uapi/linux/lwtunnel.h>
+
+#endif /* _LINUX_LWTUNNEL_H_ */
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
new file mode 100644
index 000000000000..df24b3611ff4
--- /dev/null
+++ b/include/net/lwtunnel.h
@@ -0,0 +1,132 @@
+#ifndef __NET_LWTUNNEL_H
+#define __NET_LWTUNNEL_H 1
+
+#include <linux/lwtunnel.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/route.h>
+
+#define LWTUNNEL_HASH_BITS   7
+#define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
+
+/* lw tunnel state flags */
+#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1
+
+struct lwtunnel_state {
+	__u16		type;
+	__u16		flags;
+	atomic_t	refcnt;
+	int             len;
+	__u8            data[0];
+};
+
+struct lwtunnel_encap_ops {
+	int (*build_state)(struct net_device *dev, struct nlattr *encap,
+			   struct lwtunnel_state **ts);
+	int (*output)(struct sock *sk, struct sk_buff *skb);
+	int (*fill_encap)(struct sk_buff *skb,
+			  struct lwtunnel_state *lwtstate);
+	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
+	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
+};
+
+extern const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX+1];
+
+#ifdef CONFIG_LWTUNNEL
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+	atomic_inc(&lws->refcnt);
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+	if (!lws)
+		return;
+
+	if (atomic_dec_and_test(&lws->refcnt))
+		kfree(lws);
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT))
+		return true;
+
+	return false;
+}
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+			   unsigned int num);
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap,
+			 struct lwtunnel_state **lws);
+int lwtunnel_fill_encap(struct sk_buff *skb,
+			struct lwtunnel_state *lwtstate);
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
+struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
+
+#else
+
+static inline void lwtunnel_state_get(struct lwtunnel_state *lws)
+{
+}
+
+static inline void lwtunnel_state_put(struct lwtunnel_state *lws)
+{
+}
+
+static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate)
+{
+	return false;
+}
+
+static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+
+}
+
+static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
+					 unsigned int num)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+				       struct nlattr *encap,
+				       struct lwtunnel_state **lws)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int lwtunnel_fill_encap(struct sk_buff *skb,
+				      struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	return 0;
+}
+
+static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len)
+{
+	return NULL;
+}
+
+static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a,
+				     struct lwtunnel_state *b)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
new file mode 100644
index 000000000000..aa611d931a31
--- /dev/null
+++ b/include/uapi/linux/lwtunnel.h
@@ -0,0 +1,15 @@
+#ifndef _UAPI_LWTUNNEL_H_
+#define _UAPI_LWTUNNEL_H_
+
+#include <linux/types.h>
+
+enum lwtunnel_encap_types {
+	LWTUNNEL_ENCAP_NONE,
+	LWTUNNEL_ENCAP_MPLS,
+	__LWTUNNEL_ENCAP_MAX,
+};
+
+#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1)
+
+
+#endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index 57a7c5af3175..7021c1bf44d6 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -374,6 +374,13 @@ source "net/caif/Kconfig"
 source "net/ceph/Kconfig"
 source "net/nfc/Kconfig"
 
+config LWTUNNEL
+	bool "Network light weight tunnels"
+	---help---
+	  This feature provides an infrastructure to support light weight
+	  tunnels like mpls. There is no netdevice associated with a light
+	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
+	  with light weight tunnel state associated with fib routes.
 
 endif   # if NET
 
diff --git a/net/core/Makefile b/net/core/Makefile
index fec0856dd6c0..086b01fbe1bd 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
new file mode 100644
index 000000000000..d7ae3a235b4b
--- /dev/null
+++ b/net/core/lwtunnel.c
@@ -0,0 +1,179 @@
+/*
+ * lwtunnel	Infrastructure for light weight tunnels like mpls
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/lwtunnel.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/err.h>
+
+#include <net/lwtunnel.h>
+#include <net/rtnetlink.h>
+
+struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
+{
+	struct lwtunnel_state *lws;
+
+	lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC);
+
+	return lws;
+}
+EXPORT_SYMBOL(lwtunnel_state_alloc);
+
+const struct lwtunnel_encap_ops __rcu *
+		lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly;
+
+int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int num)
+{
+	if (num > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	return !cmpxchg((const struct lwtunnel_encap_ops **)
+			&lwtun_encaps[num],
+			NULL, ops) ? 0 : -1;
+}
+EXPORT_SYMBOL(lwtunnel_encap_add_ops);
+
+int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
+			   unsigned int encap_type)
+{
+	int ret;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return -ERANGE;
+
+	ret = (cmpxchg((const struct lwtunnel_encap_ops **)
+		       &lwtun_encaps[encap_type],
+		       ops, NULL) == ops) ? 0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_encap_del_ops);
+
+int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
+			 struct nlattr *encap, struct lwtunnel_state **lws)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return ret;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[encap_type]);
+	if (likely(ops && ops->build_state))
+		ret = ops->build_state(dev, encap, lws);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	struct nlattr *nest;
+	int ret = -EINVAL;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	nest = nla_nest_start(skb, RTA_ENCAP);
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->fill_encap))
+		ret = ops->fill_encap(skb, lwtstate);
+	rcu_read_unlock();
+
+	if (ret)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+	if (ret)
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+
+	return (ret == -EOPNOTSUPP ? 0 : ret);
+}
+EXPORT_SYMBOL(lwtunnel_fill_encap);
+
+int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!lwtstate)
+		return 0;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->get_encap_size))
+		ret = nla_total_size(ops->get_encap_size(lwtstate));
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_get_encap_size);
+
+int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = 0;
+
+	if (!a && !b)
+		return 0;
+
+	if (!a || !b)
+		return 1;
+
+	if (a->type != b->type)
+		return 1;
+
+	if (a->type == LWTUNNEL_ENCAP_NONE ||
+	    a->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[a->type]);
+	if (likely(ops && ops->cmp_encap))
+		ret = ops->cmp_encap(a, b);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_cmp_encap);
-- 
cgit v1.2.3


From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Tue, 21 Jul 2015 10:43:53 +0200
Subject: mpls: ip tunnel support

This implementation uses lwtunnel infrastructure to register
hooks for mpls tunnel encaps.

It picks cues from iptunnel_encaps infrastructure and previous
mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mpls_iptunnel.h      |   6 +
 include/net/mpls_iptunnel.h        |  29 +++++
 include/uapi/linux/mpls_iptunnel.h |  28 +++++
 net/mpls/Kconfig                   |   8 +-
 net/mpls/Makefile                  |   1 +
 net/mpls/mpls_iptunnel.c           | 233 +++++++++++++++++++++++++++++++++++++
 6 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mpls_iptunnel.h
 create mode 100644 include/net/mpls_iptunnel.h
 create mode 100644 include/uapi/linux/mpls_iptunnel.h
 create mode 100644 net/mpls/mpls_iptunnel.c

(limited to 'include/linux')

diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..ef29eb2d6dfd
--- /dev/null
+++ b/include/linux/mpls_iptunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_MPLS_IPTUNNEL_H
+#define _LINUX_MPLS_IPTUNNEL_H
+
+#include <uapi/linux/mpls_iptunnel.h>
+
+#endif  /* _LINUX_MPLS_IPTUNNEL_H */
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
new file mode 100644
index 000000000000..4757997f76ed
--- /dev/null
+++ b/include/net/mpls_iptunnel.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2015 Cumulus Networks, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef _NET_MPLS_IPTUNNEL_H
+#define _NET_MPLS_IPTUNNEL_H 1
+
+#define MAX_NEW_LABELS 2
+
+struct mpls_iptunnel_encap {
+	u32	label[MAX_NEW_LABELS];
+	u32	labels;
+};
+
+static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
+{
+	return (struct mpls_iptunnel_encap *)lwtstate->data;
+}
+
+#endif
diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h
new file mode 100644
index 000000000000..d80a0498f77e
--- /dev/null
+++ b/include/uapi/linux/mpls_iptunnel.h
@@ -0,0 +1,28 @@
+/*
+ *	mpls tunnel api
+ *
+ *	Authors:
+ *		Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H
+#define _UAPI_LINUX_MPLS_IPTUNNEL_H
+
+/* MPLS tunnel attributes
+ * [RTA_ENCAP] = {
+ *     [MPLS_IPTUNNEL_DST]
+ * }
+ */
+enum {
+	MPLS_IPTUNNEL_UNSPEC,
+	MPLS_IPTUNNEL_DST,
+	__MPLS_IPTUNNEL_MAX,
+};
+#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1)
+
+#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index 17bde799c854..5c467ef97311 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -24,7 +24,13 @@ config NET_MPLS_GSO
 
 config MPLS_ROUTING
 	tristate "MPLS: routing support"
-	help
+	---help---
 	 Add support for forwarding of mpls packets.
 
+config MPLS_IPTUNNEL
+	tristate "MPLS: IP over MPLS tunnel support"
+	depends on LWTUNNEL && MPLS_ROUTING
+	---help---
+	 mpls ip tunnel support.
+
 endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 65bbe68c72e6..9ca923625016 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -3,5 +3,6 @@
 #
 obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
 obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
+obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o
 
 mpls_router-y := af_mpls.o
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
new file mode 100644
index 000000000000..eea096f21ba5
--- /dev/null
+++ b/net/mpls/mpls_iptunnel.c
@@ -0,0 +1,233 @@
+/*
+ * mpls tunnels	An implementation mpls tunnels using the light weight tunnel
+ *		infrastructure
+ *
+ * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ */
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/mpls.h>
+#include <linux/vmalloc.h>
+#include <net/ip.h>
+#include <net/dst.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/mpls_iptunnel.h>
+#include <linux/mpls_iptunnel.h>
+#include "internal.h"
+
+static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
+	[MPLS_IPTUNNEL_DST]	= { .type = NLA_U32 },
+};
+
+static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
+{
+	/* The size of the layer 2.5 labels to be added for this route */
+	return en->labels * sizeof(struct mpls_shim_hdr);
+}
+
+int mpls_output(struct sock *sk, struct sk_buff *skb)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct mpls_shim_hdr *hdr;
+	struct net_device *out_dev;
+	unsigned int hh_len;
+	unsigned int new_header_size;
+	unsigned int mtu;
+	struct dst_entry *dst = skb_dst(skb);
+	struct rtable *rt = NULL;
+	struct rt6_info *rt6 = NULL;
+	struct lwtunnel_state *lwtstate = NULL;
+	int err = 0;
+	bool bos;
+	int i;
+	unsigned int ttl;
+
+	/* Obtain the ttl */
+	if (skb->protocol == htons(ETH_P_IP)) {
+		ttl = ip_hdr(skb)->ttl;
+		rt = (struct rtable *)dst;
+		lwtstate = rt->rt_lwtstate;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ttl = ipv6_hdr(skb)->hop_limit;
+		rt6 = (struct rt6_info *)dst;
+		lwtstate = rt6->rt6i_lwtstate;
+	} else {
+		goto drop;
+	}
+
+	skb_orphan(skb);
+
+	/* Find the output device */
+	out_dev = rcu_dereference(dst->dev);
+	if (!mpls_output_possible(out_dev) ||
+	    !lwtstate || skb_warn_if_lro(skb))
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	/* Verify the destination can hold the packet */
+	new_header_size = mpls_encap_size(tun_encap_info);
+	mtu = mpls_dev_mtu(out_dev);
+	if (mpls_pkt_too_big(skb, mtu - new_header_size))
+		goto drop;
+
+	hh_len = LL_RESERVED_SPACE(out_dev);
+	if (!out_dev->header_ops)
+		hh_len = 0;
+
+	/* Ensure there is enough space for the headers in the skb */
+	if (skb_cow(skb, hh_len + new_header_size))
+		goto drop;
+
+	skb_push(skb, new_header_size);
+	skb_reset_network_header(skb);
+
+	skb->dev = out_dev;
+	skb->protocol = htons(ETH_P_MPLS_UC);
+
+	/* Push the new labels */
+	hdr = mpls_hdr(skb);
+	bos = true;
+	for (i = tun_encap_info->labels - 1; i >= 0; i--) {
+		hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
+					   ttl, 0, bos);
+		bos = false;
+	}
+
+	if (rt)
+		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
+				 skb);
+	else if (rt6)
+		err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
+				 skb);
+	if (err)
+		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
+				    __func__, err);
+
+	return 0;
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
+			    struct lwtunnel_state **ts)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
+	struct lwtunnel_state *newts;
+	int tun_encap_info_len;
+	int ret;
+
+	ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
+			       mpls_iptunnel_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[MPLS_IPTUNNEL_DST])
+		return -EINVAL;
+
+	tun_encap_info_len = sizeof(*tun_encap_info);
+
+	newts = lwtunnel_state_alloc(tun_encap_info_len);
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = tun_encap_info_len;
+	tun_encap_info = mpls_lwtunnel_encap(newts);
+	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
+			     &tun_encap_info->labels, tun_encap_info->label);
+	if (ret)
+		goto errout;
+	newts->type = LWTUNNEL_ENCAP_MPLS;
+	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+	*ts = newts;
+
+	return 0;
+
+errout:
+	kfree(newts);
+	*ts = NULL;
+
+	return ret;
+}
+
+static int mpls_fill_encap_info(struct sk_buff *skb,
+				struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+	
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
+			   tun_encap_info->label))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct mpls_iptunnel_encap *tun_encap_info;
+
+	tun_encap_info = mpls_lwtunnel_encap(lwtstate);
+
+	return nla_total_size(tun_encap_info->labels * 4);
+}
+
+static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
+	struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
+	int l;
+
+	if (a_hdr->labels != b_hdr->labels)
+		return 1;
+
+	for (l = 0; l < MAX_NEW_LABELS; l++)
+		if (a_hdr->label[l] != b_hdr->label[l])
+			return 1;
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops mpls_iptun_ops = {
+	.build_state = mpls_build_state,
+	.output = mpls_output,
+	.fill_encap = mpls_fill_encap_info,
+	.get_encap_size = mpls_encap_nlsize,
+	.cmp_encap = mpls_encap_cmp,
+};
+
+static int __init mpls_iptunnel_init(void)
+{
+	return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_init(mpls_iptunnel_init);
+
+static void __exit mpls_iptunnel_exit(void)
+{
+	lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
+}
+module_exit(mpls_iptunnel_exit);
+
+MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From ee122c79d4227f6ec642157834b6a90fcffa4382 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 21 Jul 2015 10:43:58 +0200
Subject: vxlan: Flow based tunneling

Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 149 ++++++++++++++++++++++++++++++++++++-------
 include/linux/skbuff.h       |   1 +
 include/net/dst_metadata.h   |  13 ++++
 include/net/ip_tunnels.h     |  14 ++++
 include/net/vxlan.h          |  10 ++-
 include/uapi/linux/if_link.h |   1 +
 6 files changed, 165 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ec86a11743fd..06c092b05a51 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -49,6 +49,7 @@
 #include <net/ip6_tunnel.h>
 #include <net/ip6_checksum.h>
 #endif
+#include <net/dst_metadata.h>
 
 #define VXLAN_VERSION	"0.1"
 
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+	return vs->flags & VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+	struct metadata_dst *tun_dst = NULL;
+	struct ip_tunnel_info *info;
 	struct vxlan_sock *vs;
 	struct vxlanhdr *vxh;
 	u32 flags, vni;
-	struct vxlan_metadata md = {0};
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 
 	/* Need Vxlan and inner Ethernet header to be present */
 	if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		vni &= VXLAN_VNI_MASK;
 	}
 
+	if (vxlan_collect_metadata(vs)) {
+		const struct iphdr *iph = ip_hdr(skb);
+
+		tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+		if (!tun_dst)
+			goto drop;
+
+		info = &tun_dst->u.tun_info;
+		info->key.ipv4_src = iph->saddr;
+		info->key.ipv4_dst = iph->daddr;
+		info->key.ipv4_tos = iph->tos;
+		info->key.ipv4_ttl = iph->ttl;
+		info->key.tp_src = udp_hdr(skb)->source;
+		info->key.tp_dst = udp_hdr(skb)->dest;
+
+		info->mode = IP_TUNNEL_INFO_RX;
+		info->key.tun_flags = TUNNEL_KEY;
+		info->key.tun_id = cpu_to_be64(vni >> 8);
+		if (udp_hdr(skb)->check != 0)
+			info->key.tun_flags |= TUNNEL_CSUM;
+
+		md = ip_tunnel_info_opts(info, sizeof(*md));
+		md->tun_dst = tun_dst;
+	} else {
+		memset(md, 0, sizeof(*md));
+	}
+
 	/* For backwards compatibility, only allow reserved fields to be
 	 * used by VXLAN extensions if explicitly requested.
 	 */
@@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		struct vxlanhdr_gbp *gbp;
 
 		gbp = (struct vxlanhdr_gbp *)vxh;
-		md.gbp = ntohs(gbp->policy_id);
+		md->gbp = ntohs(gbp->policy_id);
+
+		if (tun_dst)
+			info->key.tun_flags |= TUNNEL_VXLAN_OPT;
 
 		if (gbp->dont_learn)
-			md.gbp |= VXLAN_GBP_DONT_LEARN;
+			md->gbp |= VXLAN_GBP_DONT_LEARN;
 
 		if (gbp->policy_applied)
-			md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+			md->gbp |= VXLAN_GBP_POLICY_APPLIED;
 
 		flags &= ~VXLAN_GBP_USED_BITS;
 	}
@@ -1233,8 +1272,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		goto bad_flags;
 	}
 
-	md.vni = vxh->vx_vni;
-	vs->rcv(vs, skb, &md);
+	md->vni = vxh->vx_vni;
+	vs->rcv(vs, skb, md);
 	return 0;
 
 drop:
@@ -1247,6 +1286,9 @@ bad_flags:
 		   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 
 error:
+	if (tun_dst)
+		dst_release((struct dst_entry *)tun_dst);
+
 	/* Return non vxlan pkt */
 	return 1;
 }
@@ -1263,7 +1305,12 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 	int err = 0;
 	union vxlan_addr *remote_ip;
 
-	vni = ntohl(md->vni) >> 8;
+	/* For flow based devices, map all packets to VNI 0 */
+	if (vs->flags & VXLAN_F_FLOW_BASED)
+		vni = 0;
+	else
+		vni = ntohl(md->vni) >> 8;
+
 	/* Is this VNI defined? */
 	vxlan = vxlan_vs_find_vni(vs, vni);
 	if (!vxlan)
@@ -1292,12 +1339,19 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 #endif
 	}
 
+	if (md->tun_dst) {
+		skb_dst_set(skb, (struct dst_entry *)md->tun_dst);
+		md->tun_dst = NULL;
+	}
+
 	if ((vxlan->flags & VXLAN_F_LEARN) &&
 	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
 		goto drop;
 
 	skb_reset_network_header(skb);
-	skb->mark = md->gbp;
+	/* In flow-based mode, GBP is carried in dst_metadata */
+	if (!(vs->flags & VXLAN_F_FLOW_BASED))
+		skb->mark = md->gbp;
 
 	if (oip6)
 		err = IP6_ECN_decapsulate(oip6, skb);
@@ -1330,6 +1384,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
 
 	return;
 drop:
+	if (md->tun_dst)
+		dst_release((struct dst_entry *)md->tun_dst);
+
 	/* Consume bad packet */
 	kfree_skb(skb);
 }
@@ -1878,22 +1935,40 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			   struct vxlan_rdst *rdst, bool did_rsc)
 {
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct sock *sk = vxlan->vn_sock->sock->sk;
 	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
 	union vxlan_addr *dst;
-	struct vxlan_metadata md;
+	union vxlan_addr remote_ip;
+	struct vxlan_metadata _md;
+	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
 	u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
+	u32 flags = vxlan->flags;
 
-	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
-	vni = rdst->remote_vni;
-	dst = &rdst->remote_ip;
+	if (rdst) {
+		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
+		vni = rdst->remote_vni;
+		dst = &rdst->remote_ip;
+	} else {
+		if (!info) {
+			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
+				  dev->name);
+			goto drop;
+		}
+
+		dst_port = info->key.tp_dst ? : vxlan->dst_port;
+		vni = be64_to_cpu(info->key.tun_id);
+		remote_ip.sin.sin_family = AF_INET;
+		remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst;
+		dst = &remote_ip;
+	}
 
 	if (vxlan_addr_any(dst)) {
 		if (did_rsc) {
@@ -1918,8 +1993,25 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     vxlan->port_max, true);
 
 	if (dst->sa.sa_family == AF_INET) {
+		if (info) {
+			if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+				df = htons(IP_DF);
+			if (info->key.tun_flags & TUNNEL_CSUM)
+				flags |= VXLAN_F_UDP_CSUM;
+			else
+				flags &= ~VXLAN_F_UDP_CSUM;
+
+			ttl = info->key.ipv4_ttl;
+			tos = info->key.ipv4_tos;
+
+			if (info->options_len)
+				md = ip_tunnel_info_opts(info, sizeof(*md));
+		} else {
+			md->gbp = skb->mark;
+		}
+
 		memset(&fl4, 0, sizeof(fl4));
-		fl4.flowi4_oif = rdst->remote_ifindex;
+		fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
 		fl4.flowi4_tos = RT_TOS(tos);
 		fl4.flowi4_mark = skb->mark;
 		fl4.flowi4_proto = IPPROTO_UDP;
@@ -1958,14 +2050,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
 		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
-
+		md->vni = htonl(vni << 8);
 		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
 				     dst->sin.sin_addr.s_addr, tos, ttl, df,
-				     src_port, dst_port, &md,
+				     src_port, dst_port, md,
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
-				     vxlan->flags);
+				     flags);
 		if (err < 0) {
 			/* skb is already freed. */
 			skb = NULL;
@@ -1980,7 +2070,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		u32 flags;
 
 		memset(&fl6, 0, sizeof(fl6));
-		fl6.flowi6_oif = rdst->remote_ifindex;
+		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
 		fl6.daddr = dst->sin6.sin6_addr;
 		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
 		fl6.flowi6_mark = skb->mark;
@@ -2018,11 +2108,11 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		ttl = ttl ? : ip6_dst_hoplimit(ndst);
-		md.vni = htonl(vni << 8);
-		md.gbp = skb->mark;
+		md->vni = htonl(vni << 8);
+		md->gbp = skb->mark;
 
 		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
-				      0, ttl, src_port, dst_port, &md,
+				      0, ttl, src_port, dst_port, md,
 				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
 				      vxlan->flags);
 #endif
@@ -2051,6 +2141,7 @@ tx_free:
 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	struct ethhdr *eth;
 	bool did_rsc = false;
 	struct vxlan_rdst *rdst, *fdst = NULL;
@@ -2078,6 +2169,12 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 	}
 
+	if (vxlan->flags & VXLAN_F_FLOW_BASED &&
+	    info && info->mode == IP_TUNNEL_INFO_TX) {
+		vxlan_xmit_one(skb, dev, NULL, false);
+		return NETDEV_TX_OK;
+	}
+
 	f = vxlan_find_mac(vxlan, eth->h_dest);
 	did_rsc = false;
 
@@ -2405,6 +2502,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_FLOWBASED]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
 	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
@@ -2681,6 +2779,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_LIMIT])
 		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
 
+	if (data[IFLA_VXLAN_FLOWBASED] &&
+	    nla_get_u8(data[IFLA_VXLAN_FLOWBASED]))
+		vxlan->flags |= VXLAN_F_FLOW_BASED;
+
 	if (data[IFLA_VXLAN_PORT_RANGE]) {
 		const struct ifla_vxlan_port_range *p
 			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
@@ -2777,6 +2879,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
+		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_FLOWBASED */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
 		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
 		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
@@ -2843,6 +2946,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
 			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
+	    nla_put_u8(skb, IFLA_VXLAN_FLOWBASED,
+		       !!(vxlan->flags & VXLAN_F_FLOW_BASED)) ||
 	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
 	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
 	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6bd96fe9416a..648a2c241993 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3469,5 +3469,6 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
 			       skb_network_header(skb);
 	return hdr_len + skb_gso_transport_seglen(skb);
 }
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 4f7694f3c7d0..e843937fb30a 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -8,6 +8,9 @@
 struct metadata_dst {
 	struct dst_entry		dst;
 	size_t				opts_len;
+	union {
+		struct ip_tunnel_info	tun_info;
+	} u;
 };
 
 static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
@@ -20,6 +23,16 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb)
 	return NULL;
 }
 
+static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+
+	if (md_dst)
+		return &md_dst->u.tun_info;
+
+	return NULL;
+}
+
 static inline bool skb_valid_dst(const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6b9d559ce5f5..d11530f1c1e2 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -38,10 +38,19 @@ struct ip_tunnel_key {
 	__be16			tp_dst;
 } __packed __aligned(4); /* Minimize padding. */
 
+/* Indicates whether the tunnel info structure represents receive
+ * or transmit tunnel parameters.
+ */
+enum {
+	IP_TUNNEL_INFO_RX,
+	IP_TUNNEL_INFO_TX,
+};
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 	const void		*options;
 	u8			options_len;
+	u8			mode;
 };
 
 /* 6rd prefix/relay information */
@@ -284,6 +293,11 @@ static inline void iptunnel_xmit_stats(int err,
 	}
 }
 
+static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n)
+{
+	return info + 1;
+}
+
 #endif /* CONFIG_INET */
 
 #endif /* __NET_IP_TUNNELS_H */
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 0082b5d33d7d..80a2da29e088 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -7,6 +7,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/udp.h>
+#include <net/dst_metadata.h>
 
 #define VNI_HASH_BITS	10
 #define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
@@ -97,6 +98,9 @@ struct vxlanhdr {
 struct vxlan_metadata {
 	__be32		vni;
 	u32		gbp;
+
+	/* Temporary until vxlan_rcv() API is gone */
+	struct metadata_dst *tun_dst;
 };
 
 struct vxlan_sock;
@@ -130,6 +134,8 @@ struct vxlan_sock {
 #define VXLAN_F_REMCSUM_RX		0x400
 #define VXLAN_F_GBP			0x800
 #define VXLAN_F_REMCSUM_NOPARTIAL	0x1000
+#define VXLAN_F_COLLECT_METADATA	0x2000
+#define VXLAN_F_FLOW_BASED		0x4000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -137,7 +143,9 @@ struct vxlan_sock {
 #define VXLAN_F_RCV_FLAGS		(VXLAN_F_GBP |			\
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_REMCSUM_RX |		\
-					 VXLAN_F_REMCSUM_NOPARTIAL)
+					 VXLAN_F_REMCSUM_NOPARTIAL |	\
+					 VXLAN_F_COLLECT_METADATA |	\
+					 VXLAN_F_FLOW_BASED)
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 24d68b797c59..9eeb5d9cf8f0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -382,6 +382,7 @@ enum {
 	IFLA_VXLAN_REMCSUM_RX,
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
+	IFLA_VXLAN_FLOWBASED,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 3985e8a3611a93bb36789f65db862e5700aab65e Mon Sep 17 00:00:00 2001
From: Erik Kline <ek@google.com>
Date: Wed, 22 Jul 2015 16:38:25 +0900
Subject: ipv6: sysctl to restrict candidate source addresses

Per RFC 6724, section 4, "Candidate Source Addresses":

    It is RECOMMENDED that the candidate source addresses be the set
    of unicast addresses assigned to the interface that will be used
    to send to the destination (the "outgoing" interface).

Add a sysctl to enable this behaviour.

Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 22 +++++++++++++++++++---
 4 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f63aeefd2c24..1a5ab21bcca5 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1460,6 +1460,13 @@ router_solicitations - INTEGER
 	routers are present.
 	Default: 3
 
+use_oif_addrs_only - BOOLEAN
+	When enabled, the candidate source addresses for destinations
+	routed via this interface are restricted to the set of addresses
+	configured on this interface (vis. RFC 6724, section 4).
+
+	Default: false
+
 use_tempaddr - INTEGER
 	Preference for Privacy Extensions (RFC3041).
 	  <= 0 : disable Privacy Extensions
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1319a6bb6b82..06ed637225b8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -57,6 +57,7 @@ struct ipv6_devconf {
 		bool initialized;
 		struct in6_addr secret;
 	} stable_secret;
+	__s32		use_oif_addrs_only;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 5efa54ae567c..641a146ead7d 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -171,6 +171,7 @@ enum {
 	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
+	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 32153c248959..eb0c6a3a8a00 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.accept_ra_mtu		= 1,
 	.stable_secret		= {
 		.initialized = false,
-	}
+	},
+	.use_oif_addrs_only	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.stable_secret		= {
 		.initialized = false,
 	},
+	.use_oif_addrs_only	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -1472,11 +1474,16 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
 	 *    include addresses assigned to interfaces
 	 *    belonging to the same site as the outgoing
 	 *    interface.)
+	 *  - "It is RECOMMENDED that the candidate source addresses
+	 *    be the set of unicast addresses assigned to the
+	 *    interface that will be used to send to the destination
+	 *    (the 'outgoing' interface)." (RFC 6724)
 	 */
 	if (dst_dev) {
+		idev = __in6_dev_get(dst_dev);
 		if ((dst_type & IPV6_ADDR_MULTICAST) ||
-		    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) {
-			idev = __in6_dev_get(dst_dev);
+		    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
+		    (idev && idev->cnf.use_oif_addrs_only)) {
 			use_oif_addr = true;
 		}
 	}
@@ -4607,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
 	array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
 	/* we omit DEVCONF_STABLE_SECRET for now */
+	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5605,6 +5613,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0600,
 			.proc_handler	= addrconf_sysctl_stable_secret,
 		},
+		{
+			.procname       = "use_oif_addrs_only",
+			.data           = &ipv6_devconf.use_oif_addrs_only,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From 3bbd14e0a2e3a988b1b5fe702a2539bd8d0ec622 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 20 Jul 2015 13:32:52 +0200
Subject: netfilter: rename local nf_hook_list to hook_list

085db2c04557 ("netfilter: Per network namespace netfilter hooks.") introduced a
new nf_hook_list that is global, so let's avoid this overlap.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/netfilter.h | 14 +++++++-------
 net/netfilter/core.c      | 28 ++++++++++++++--------------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e01da73ee6c4..d788ce62d826 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -140,20 +140,20 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 #ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 
-static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+static inline bool nf_hook_list_active(struct list_head *hook_list,
 				       u_int8_t pf, unsigned int hook)
 {
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook))
 		return static_key_false(&nf_hooks_needed[pf][hook]);
 
-	return !list_empty(nf_hook_list);
+	return !list_empty(hook_list);
 }
 #else
-static inline bool nf_hook_list_active(struct list_head *nf_hook_list,
+static inline bool nf_hook_list_active(struct list_head *hook_list,
 				       u_int8_t pf, unsigned int hook)
 {
-	return !list_empty(nf_hook_list);
+	return !list_empty(hook_list);
 }
 #endif
 
@@ -175,12 +175,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 				 int thresh)
 {
 	struct net *net = dev_net(indev ? indev : outdev);
-	struct list_head *nf_hook_list = &net->nf.hooks[pf][hook];
+	struct list_head *hook_list = &net->nf.hooks[pf][hook];
 
-	if (nf_hook_list_active(nf_hook_list, pf, hook)) {
+	if (nf_hook_list_active(hook_list, pf, hook)) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, nf_hook_list, hook, thresh,
+		nf_hook_state_init(&state, hook_list, hook, thresh,
 				   pf, indev, outdev, sk, okfn);
 		return nf_hook_slow(skb, &state);
 	}
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0ecb2b52f276..2a5a0704245c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -62,20 +62,20 @@ EXPORT_SYMBOL(nf_hooks_needed);
 
 static DEFINE_MUTEX(nf_hook_mutex);
 
-static struct list_head *find_nf_hook_list(struct net *net,
+static struct list_head *nf_find_hook_list(struct net *net,
 					   const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list = NULL;
+	struct list_head *hook_list = NULL;
 
 	if (reg->pf != NFPROTO_NETDEV)
-		nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
+		hook_list = &net->nf.hooks[reg->pf][reg->hooknum];
 	else if (reg->hooknum == NF_NETDEV_INGRESS) {
 #ifdef CONFIG_NETFILTER_INGRESS
 		if (reg->dev && dev_net(reg->dev) == net)
-			nf_hook_list = &reg->dev->nf_hooks_ingress;
+			hook_list = &reg->dev->nf_hooks_ingress;
 #endif
 	}
-	return nf_hook_list;
+	return hook_list;
 }
 
 struct nf_hook_entry {
@@ -85,7 +85,7 @@ struct nf_hook_entry {
 
 int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list;
+	struct list_head *hook_list;
 	struct nf_hook_entry *entry;
 	struct nf_hook_ops *elem;
 
@@ -96,14 +96,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	entry->orig_ops	= reg;
 	entry->ops	= *reg;
 
-	nf_hook_list = find_nf_hook_list(net, reg);
-	if (!nf_hook_list) {
+	hook_list = nf_find_hook_list(net, reg);
+	if (!hook_list) {
 		kfree(entry);
 		return -ENOENT;
 	}
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, nf_hook_list, list) {
+	list_for_each_entry(elem, hook_list, list) {
 		if (reg->priority < elem->priority)
 			break;
 	}
@@ -122,16 +122,16 @@ EXPORT_SYMBOL(nf_register_net_hook);
 
 void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 {
-	struct list_head *nf_hook_list;
+	struct list_head *hook_list;
 	struct nf_hook_entry *entry;
 	struct nf_hook_ops *elem;
 
-	nf_hook_list = find_nf_hook_list(net, reg);
-	if (!nf_hook_list)
+	hook_list = nf_find_hook_list(net, reg);
+	if (!hook_list)
 		return;
 
 	mutex_lock(&nf_hook_mutex);
-	list_for_each_entry(elem, nf_hook_list, list) {
+	list_for_each_entry(elem, hook_list, list) {
 		entry = container_of(elem, struct nf_hook_entry, ops);
 		if (entry->orig_ops == reg) {
 			list_del_rcu(&entry->ops.list);
@@ -139,7 +139,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 		}
 	}
 	mutex_unlock(&nf_hook_mutex);
-	if (&elem->list == nf_hook_list) {
+	if (&elem->list == hook_list) {
 		WARN(1, "nf_unregister_net_hook: hook not found!\n");
 		return;
 	}
-- 
cgit v1.2.3


From e0910bace663b78c026b73bbd711a24ccf410531 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Thu, 23 Jul 2015 15:43:56 +0200
Subject: lwtunnel: export linux/lwtunnel.h to userspace

Note also that include/linux/lwtunnel.h is not needed.

CC: Thomas Graf <tgraf@suug.ch>
CC: Roopa Prabhu <roopa@cumulusnetworks.com>
Fixes: 499a24256862 ("lwtunnel: infrastructure for handling light weight tunnels like mpls")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/lwtunnel.h  | 6 ------
 include/uapi/linux/Kbuild | 1 +
 2 files changed, 1 insertion(+), 6 deletions(-)
 delete mode 100644 include/linux/lwtunnel.h

(limited to 'include/linux')

diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h
deleted file mode 100644
index 97f32f8b4ae1..000000000000
--- a/include/linux/lwtunnel.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _LINUX_LWTUNNEL_H_
-#define _LINUX_LWTUNNEL_H_
-
-#include <uapi/linux/lwtunnel.h>
-
-#endif /* _LINUX_LWTUNNEL_H_ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 1ff9942718fe..aafb9937b162 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -243,6 +243,7 @@ header-y += limits.h
 header-y += llc.h
 header-y += loop.h
 header-y += lp.h
+header-y += lwtunnel.h
 header-y += magic.h
 header-y += major.h
 header-y += map_to_7segment.h
-- 
cgit v1.2.3


From 2be6967cdbc95a9960b620defedbf5e02e2af619 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:56 +0300
Subject: net/mlx5e: Support ETH_RSS_HASH_XOR

The ConnectX-4 HW implements inverted XOR8.
To make it act as XOR we re-order the HW RSS indirection table.

Set XOR to be the default RSS hash function and add ethtool API to
control it.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  1 +
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 39 ++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 46 +++++++++++++++++-----
 include/linux/mlx5/mlx5_ifc.h                      |  6 +--
 4 files changed, 79 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 3d23bd657e3c..61d8433392aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -195,6 +195,7 @@ struct mlx5e_params {
 	u16 rx_hash_log_tbl_sz;
 	bool lro_en;
 	u32 lro_wqe_sz;
+	u8  rss_hfunc;
 };
 
 enum {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 388938482ff9..cb2853570504 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -662,6 +662,43 @@ out:
 	return err;
 }
 
+static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
+			  u8 *hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (hfunc)
+		*hfunc = priv->params.rss_hfunc;
+
+	return 0;
+}
+
+static int mlx5e_set_rxfh(struct net_device *netdev, const u32 *indir,
+			  const u8 *key, const u8 hfunc)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	int err = 0;
+
+	if (hfunc == ETH_RSS_HASH_NO_CHANGE)
+		return 0;
+
+	if ((hfunc != ETH_RSS_HASH_XOR) &&
+	    (hfunc != ETH_RSS_HASH_TOP))
+		return -EINVAL;
+
+	mutex_lock(&priv->state_lock);
+
+	priv->params.rss_hfunc = hfunc;
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+		mlx5e_close_locked(priv->netdev);
+		err = mlx5e_open_locked(priv->netdev);
+	}
+
+	mutex_unlock(&priv->state_lock);
+
+	return err;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -676,4 +713,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.set_coalesce      = mlx5e_set_coalesce,
 	.get_settings      = mlx5e_get_settings,
 	.set_settings      = mlx5e_set_settings,
+	.get_rxfh          = mlx5e_get_rxfh,
+	.set_rxfh          = mlx5e_set_rxfh,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 40206da1f9d7..07d36275021e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1158,6 +1158,24 @@ static void mlx5e_close_tises(struct mlx5e_priv *priv)
 		mlx5e_close_tis(priv, tc);
 }
 
+static int mlx5e_rx_hash_fn(int hfunc)
+{
+	return (hfunc == ETH_RSS_HASH_TOP) ?
+	       MLX5_RX_HASH_FN_TOEPLITZ :
+	       MLX5_RX_HASH_FN_INVERTED_XOR8;
+}
+
+static int mlx5e_bits_invert(unsigned long a, int size)
+{
+	int inv = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		inv |= (test_bit(size - i - 1, &a) ? 1 : 0) << i;
+
+	return inv;
+}
+
 static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
@@ -1166,11 +1184,10 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 	void *rqtc;
 	int inlen;
 	int err;
-	int sz;
+	int log_tbl_sz = priv->params.rx_hash_log_tbl_sz;
+	int sz = 1 << log_tbl_sz;
 	int i;
 
-	sz = 1 << priv->params.rx_hash_log_tbl_sz;
-
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = mlx5_vzalloc(inlen);
 	if (!in)
@@ -1182,8 +1199,12 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv)
 	MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 
 	for (i = 0; i < sz; i++) {
-		int ix = i % priv->params.num_channels;
+		int ix = i;
+
+		if (priv->params.rss_hfunc == ETH_RSS_HASH_XOR)
+			ix = mlx5e_bits_invert(i, log_tbl_sz);
 
+		ix = ix % priv->params.num_channels;
 		MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix]->rq.rqn);
 	}
 
@@ -1254,12 +1275,16 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 		MLX5_SET(tirc, tirc, indirect_table,
 			 priv->rqtn);
 		MLX5_SET(tirc, tirc, rx_hash_fn,
-			 MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ);
-		MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
-		netdev_rss_key_fill(MLX5_ADDR_OF(tirc, tirc,
-						 rx_hash_toeplitz_key),
-				    MLX5_FLD_SZ_BYTES(tirc,
-						      rx_hash_toeplitz_key));
+			 mlx5e_rx_hash_fn(priv->params.rss_hfunc));
+		if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) {
+			void *rss_key = MLX5_ADDR_OF(tirc, tirc,
+						     rx_hash_toeplitz_key);
+			size_t len = MLX5_FLD_SZ_BYTES(tirc,
+						       rx_hash_toeplitz_key);
+
+			MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
+			netdev_rss_key_fill(rss_key, len);
+		}
 		break;
 	}
 
@@ -1700,6 +1725,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
 		MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ;
 	priv->params.num_tc                = 1;
 	priv->params.default_vlan_prio     = 0;
+	priv->params.rss_hfunc             = ETH_RSS_HASH_XOR;
 
 	priv->params.lro_en = false && !!MLX5_CAP_ETH(priv->mdev, lro_cap);
 	priv->params.lro_wqe_sz            =
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6d2f6fee041c..c60a62bba652 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1936,9 +1936,9 @@ enum {
 };
 
 enum {
-	MLX5_TIRC_RX_HASH_FN_HASH_NONE           = 0x0,
-	MLX5_TIRC_RX_HASH_FN_HASH_INVERTED_XOR8  = 0x1,
-	MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ       = 0x2,
+	MLX5_RX_HASH_FN_NONE           = 0x0,
+	MLX5_RX_HASH_FN_INVERTED_XOR8  = 0x1,
+	MLX5_RX_HASH_FN_TOEPLITZ       = 0x2,
 };
 
 enum {
-- 
cgit v1.2.3


From 311c7c71c9bb8786c96fee353fe9886c08b017fe Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:57 +0300
Subject: net/mlx5e: Allocate DMA coherent memory on reader NUMA node

By affinity hints and XPS, each mlx5e channel is assigned a CPU
core.

Channel DMA coherent memory that is written by the NIC and read
by SW (e.g CQ buffer) is allocated on the NUMA node of the CPU
core assigned for the channel.

Channel DMA coherent memory that is written by SW and read by the
NIC (e.g SQ/RQ buffer) is allocated on the NUMA node of the NIC.

Doorbell record (written by SW and read by the NIC) is an
exception since it is accessed by SW more frequently.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c   | 48 +++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |  6 ++-
 drivers/net/ethernet/mellanox/mlx5/core/wq.c      | 12 +++---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      |  3 +-
 include/linux/mlx5/driver.h                       |  8 ++++
 6 files changed, 70 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 0715b497511f..6cb38304669f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -45,15 +45,34 @@
  * register it in a memory region at HCA virtual address 0.
  */
 
-int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
+static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
+					   size_t size, dma_addr_t *dma_handle,
+					   int node)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int original_node;
+	void *cpu_handle;
+
+	mutex_lock(&priv->alloc_mutex);
+	original_node = dev_to_node(&dev->pdev->dev);
+	set_dev_node(&dev->pdev->dev, node);
+	cpu_handle = dma_zalloc_coherent(&dev->pdev->dev, size,
+					 dma_handle, GFP_KERNEL);
+	set_dev_node(&dev->pdev->dev, original_node);
+	mutex_unlock(&priv->alloc_mutex);
+	return cpu_handle;
+}
+
+int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			struct mlx5_buf *buf, int node)
 {
 	dma_addr_t t;
 
 	buf->size = size;
 	buf->npages       = 1;
 	buf->page_shift   = (u8)get_order(size) + PAGE_SHIFT;
-	buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
-						size, &t, GFP_KERNEL);
+	buf->direct.buf   = mlx5_dma_zalloc_coherent_node(dev, size,
+							  &t, node);
 	if (!buf->direct.buf)
 		return -ENOMEM;
 
@@ -66,6 +85,11 @@ int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
 
 	return 0;
 }
+
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf)
+{
+	return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node);
+}
 EXPORT_SYMBOL_GPL(mlx5_buf_alloc);
 
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
@@ -75,7 +99,8 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_free);
 
-static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
+static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev,
+						 int node)
 {
 	struct mlx5_db_pgdir *pgdir;
 
@@ -84,8 +109,9 @@ static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
 		return NULL;
 
 	bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE);
-	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
-					    &pgdir->db_dma, GFP_KERNEL);
+
+	pgdir->db_page = mlx5_dma_zalloc_coherent_node(dev, PAGE_SIZE,
+						       &pgdir->db_dma, node);
 	if (!pgdir->db_page) {
 		kfree(pgdir);
 		return NULL;
@@ -118,7 +144,7 @@ static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir,
 	return 0;
 }
 
-int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node)
 {
 	struct mlx5_db_pgdir *pgdir;
 	int ret = 0;
@@ -129,7 +155,7 @@ int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
 		if (!mlx5_alloc_db_from_pgdir(pgdir, db))
 			goto out;
 
-	pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev));
+	pgdir = mlx5_alloc_db_pgdir(dev, node);
 	if (!pgdir) {
 		ret = -ENOMEM;
 		goto out;
@@ -145,6 +171,12 @@ out:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mlx5_db_alloc_node);
+
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	return mlx5_db_alloc_node(dev, db, dev->priv.numa_node);
+}
 EXPORT_SYMBOL_GPL(mlx5_db_alloc);
 
 void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 07d36275021e..57cc8960b73b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -272,6 +272,8 @@ static int mlx5e_create_rq(struct mlx5e_channel *c,
 	int err;
 	int i;
 
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+
 	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
 				&rq->wq_ctrl);
 	if (err)
@@ -502,6 +504,8 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	if (err)
 		return err;
 
+	param->wq.db_numa_node = cpu_to_node(c->cpu);
+
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
 				 &sq->wq_ctrl);
 	if (err)
@@ -702,7 +706,8 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
 	int err;
 	u32 i;
 
-	param->wq.numa = cpu_to_node(c->cpu);
+	param->wq.buf_numa_node = cpu_to_node(c->cpu);
+	param->wq.db_numa_node  = cpu_to_node(c->cpu);
 	param->eq_ix   = c->ix;
 
 	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
@@ -1000,7 +1005,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, log_wq_sz,        priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd,               priv->pdn);
 
-	param->wq.numa   = dev_to_node(&priv->mdev->pdev->dev);
+	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
 	param->wq.linear = 1;
 }
 
@@ -1014,7 +1019,7 @@ static void mlx5e_build_sq_param(struct mlx5e_priv *priv,
 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 	MLX5_SET(wq, wq, pd,            priv->pdn);
 
-	param->wq.numa = dev_to_node(&priv->mdev->pdev->dev);
+	param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
 }
 
 static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index afad529838de..c34eafbf1c04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -455,7 +455,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
 	struct mlx5_priv *priv  = &mdev->priv;
 	struct msix_entry *msix = priv->msix_arr;
 	int irq                 = msix[i + MLX5_EQ_VEC_COMP_BASE].vector;
-	int numa_node           = dev_to_node(&mdev->pdev->dev);
+	int numa_node           = priv->numa_node;
 	int err;
 
 	if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
@@ -668,6 +668,10 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 	INIT_LIST_HEAD(&priv->pgdir_list);
 	spin_lock_init(&priv->mkey_lock);
 
+	mutex_init(&priv->alloc_mutex);
+
+	priv->numa_node = dev_to_node(&dev->pdev->dev);
+
 	priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root);
 	if (!priv->dbg_root)
 		return -ENOMEM;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 8388411582cf..ce21ee5b2357 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -73,13 +73,14 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
 	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_buf_alloc(mdev, mlx5_wq_cyc_get_byte_size(wq), &wq_ctrl->buf);
+	err = mlx5_buf_alloc_node(mdev, mlx5_wq_cyc_get_byte_size(wq),
+				  &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
 		goto err_db_free;
@@ -108,13 +109,14 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_sz = MLX5_GET(cqc, cqc, log_cq_size);
 	wq->sz_m1 = (1 << wq->log_sz) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
 	}
 
-	err = mlx5_buf_alloc(mdev, mlx5_cqwq_get_byte_size(wq), &wq_ctrl->buf);
+	err = mlx5_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
+				  &wq_ctrl->buf, param->buf_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_buf_alloc() failed, %d\n", err);
 		goto err_db_free;
@@ -144,7 +146,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	wq->log_stride = MLX5_GET(wq, wqc, log_wq_stride);
 	wq->sz_m1 = (1 << MLX5_GET(wq, wqc, log_wq_sz)) - 1;
 
-	err = mlx5_db_alloc(mdev, &wq_ctrl->db);
+	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_db_alloc() failed, %d\n", err);
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index e0ddd69fb429..6c2a8f95093c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -37,7 +37,8 @@
 
 struct mlx5_wq_param {
 	int		linear;
-	int		numa;
+	int		buf_numa_node;
+	int		db_numa_node;
 };
 
 struct mlx5_wq_ctrl {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5722d88c2429..1c0d5d062d7c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -463,6 +463,10 @@ struct mlx5_priv {
 	/* end: mr staff */
 
 	/* start: alloc staff */
+	/* protect buffer alocation according to numa node */
+	struct mutex            alloc_mutex;
+	int                     numa_node;
+
 	struct mutex            pgdir_mutex;
 	struct list_head        pgdir_list;
 	/* end: alloc staff */
@@ -672,6 +676,8 @@ void mlx5_health_cleanup(void);
 void  __init mlx5_health_init(void);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
 struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
@@ -773,6 +779,8 @@ void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
 void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db);
+int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db,
+		       int node);
 void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
 
 const char *mlx5_command_str(int command);
-- 
cgit v1.2.3


From 88a85f99e51fb2373259ab83c8bb130a9bbf3804 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Thu, 23 Jul 2015 23:35:59 +0300
Subject: net/mlx5e: TX latency optimization to save DMA reads

A regular TX WQE execution involves two or more DMA reads -
one to fetch the WQE, and another one per WQE gather entry.

These DMA reads obviously increase the TX latency.
There are two mlx5 mechanisms to bypass these DMA reads:
1) Inline WQE
2) Blue Flame (BF)

An inline WQE contains a whole packet, thus saves the DMA read/s
of the regular WQE gather entry/s. Inline WQE support was already
added in the previous commit.

A BF WQE is written directly to the device I/O mapped memory, thus
enables saving the DMA read that fetches the WQE.

The BF WQE I/O write must be in cache line granularity, thus uses
the CPU write combining mechanism.
A BF WQE I/O write acts also as a TX doorbell for notifying the
device of new TX WQEs.
A BF WQE is written to the same I/O mapped address as the regular TX
doorbell, thus this address is being mapped twice - once by ioremap()
and once by io_mapping_map_wc().

While both mechanisms reduce the TX latency, they both consume more CPU
cycles than a regular WQE:
- A BF WQE must still be written to host memory, in addition to being
  written directly to the device I/O mapped memory.
- An inline WQE involves copying the SKB data into it.

To handle this tradeoff, we introduce here a heuristic algorithm that
strives to avoid using these two mechanisms in case the TX queue is
being back-pressured by the device, and limit their usage rate otherwise.

An inline WQE will always be "Blue Flamed" (written directly to the
device I/O mapped memory) while a BF WQE may not be inlined (may contain
gather entries).

Preliminary testing using netperf UDP_RR shows that the latency goes down
from 17.5us to 16.9us, while the message rate (tested with pktgen) stays
the same.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      | 24 +++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 26 ++++++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/main.c    | 26 +++++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/uar.c     |  6 ++++++
 include/linux/mlx5/driver.h                       |  4 +++-
 6 files changed, 79 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index d9dc506188c8..b66edd2c5a61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -60,6 +60,7 @@
 
 #define MLX5E_TX_CQ_POLL_BUDGET        128
 #define MLX5E_UPDATE_STATS_INTERVAL    200 /* msecs */
+#define MLX5E_SQ_BF_BUDGET             16
 
 static const char vport_strings[][ETH_GSTRING_LEN] = {
 	/* vport statistics */
@@ -268,7 +269,9 @@ struct mlx5e_sq {
 	/* dirtied @xmit */
 	u16                        pc ____cacheline_aligned_in_smp;
 	u32                        dma_fifo_pc;
-	u32                        bf_offset;
+	u16                        bf_offset;
+	u16                        prev_cc;
+	u8                         bf_budget;
 	struct mlx5e_sq_stats      stats;
 
 	struct mlx5e_cq            cq;
@@ -281,9 +284,10 @@ struct mlx5e_sq {
 	struct mlx5_wq_cyc         wq;
 	u32                        dma_fifo_mask;
 	void __iomem              *uar_map;
+	void __iomem              *uar_bf_map;
 	struct netdev_queue       *txq;
 	u32                        sqn;
-	u32                        bf_buf_size;
+	u16                        bf_buf_size;
 	u16                        max_inline;
 	u16                        edge;
 	struct device             *pdev;
@@ -493,8 +497,10 @@ int mlx5e_update_priv_params(struct mlx5e_priv *priv,
 			     struct mlx5e_params *new_params);
 
 static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
-				      struct mlx5e_tx_wqe *wqe)
+				      struct mlx5e_tx_wqe *wqe, int bf_sz)
 {
+	u16 ofst = MLX5_BF_OFFSET + sq->bf_offset;
+
 	/* ensure wqe is visible to device before updating doorbell record */
 	dma_wmb();
 
@@ -505,9 +511,15 @@ static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq,
 	 */
 	wmb();
 
-	mlx5_write64((__be32 *)&wqe->ctrl,
-		     sq->uar_map + MLX5_BF_OFFSET + sq->bf_offset,
-		     NULL);
+	if (bf_sz) {
+		__iowrite64_copy(sq->uar_bf_map + ofst, &wqe->ctrl, bf_sz);
+
+		/* flush the write-combining mapped buffer */
+		wmb();
+
+	} else {
+		mlx5_write64((__be32 *)&wqe->ctrl, sq->uar_map + ofst, NULL);
+	}
 
 	sq->bf_offset ^= sq->bf_buf_size;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c55fad431cbf..4a87e9dcf52c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -514,6 +514,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 
 	sq->wq.db       = &sq->wq.db[MLX5_SND_DBR];
 	sq->uar_map     = sq->uar.map;
+	sq->uar_bf_map  = sq->uar.bf_map;
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 	sq->max_inline  = param->max_inline;
 
@@ -524,11 +525,12 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	txq_ix = c->ix + tc * priv->params.num_channels;
 	sq->txq = netdev_get_tx_queue(priv->netdev, txq_ix);
 
-	sq->pdev    = c->pdev;
-	sq->mkey_be = c->mkey_be;
-	sq->channel = c;
-	sq->tc      = tc;
-	sq->edge    = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+	sq->pdev      = c->pdev;
+	sq->mkey_be   = c->mkey_be;
+	sq->channel   = c;
+	sq->tc        = tc;
+	sq->edge      = (sq->wq.sz_m1 + 1) - MLX5_SEND_WQE_MAX_WQEBBS;
+	sq->bf_budget = MLX5E_SQ_BF_BUDGET;
 	priv->txq_to_sq_map[txq_ix] = sq;
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 351ac6982e22..64380bc0cd6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -57,7 +57,7 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
 
 	if (notify_hw) {
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, 0);
 	}
 }
 
@@ -110,7 +110,7 @@ u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
-					    struct sk_buff *skb)
+					    struct sk_buff *skb, bool bf)
 {
 	/* Some NIC TX decisions, e.g loopback, are based on the packet
 	 * headers and occur before the data gather.
@@ -118,7 +118,7 @@ static inline u16 mlx5e_get_inline_hdr_size(struct mlx5e_sq *sq,
 	 */
 #define MLX5E_MIN_INLINE (ETH_HLEN + 2/*vlan tag*/)
 
-	if (skb_headlen(skb) <= sq->max_inline)
+	if (bf && (skb_headlen(skb) <= sq->max_inline))
 		return skb_headlen(skb);
 
 	return MLX5E_MIN_INLINE;
@@ -137,6 +137,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 
 	u8  opcode = MLX5_OPCODE_SEND;
 	dma_addr_t dma_addr = 0;
+	bool bf = false;
 	u16 headlen;
 	u16 ds_cnt;
 	u16 ihs;
@@ -149,6 +150,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	else
 		sq->stats.csum_offload_none++;
 
+	if (sq->cc != sq->prev_cc) {
+		sq->prev_cc = sq->cc;
+		sq->bf_budget = (sq->cc == sq->pc) ? MLX5E_SQ_BF_BUDGET : 0;
+	}
+
 	if (skb_is_gso(skb)) {
 		u32 payload_len;
 
@@ -161,7 +167,10 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 		sq->stats.tso_packets++;
 		sq->stats.tso_bytes += payload_len;
 	} else {
-		ihs = mlx5e_get_inline_hdr_size(sq, skb);
+		bf = sq->bf_budget &&
+		     !skb->xmit_more &&
+		     !skb_shinfo(skb)->nr_frags;
+		ihs = mlx5e_get_inline_hdr_size(sq, skb, bf);
 		MLX5E_TX_SKB_CB(skb)->num_bytes = max_t(unsigned int, skb->len,
 							ETH_ZLEN);
 	}
@@ -233,14 +242,21 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
 	}
 
 	if (!skb->xmit_more || netif_xmit_stopped(sq->txq)) {
+		int bf_sz = 0;
+
+		if (bf && sq->uar_bf_map)
+			bf_sz = MLX5E_TX_SKB_CB(skb)->num_wqebbs << 3;
+
 		cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
-		mlx5e_tx_notify_hw(sq, wqe);
+		mlx5e_tx_notify_hw(sq, wqe, bf_sz);
 	}
 
 	/* fill sq edge with nops to avoid wqe wrap around */
 	while ((sq->pc & wq->sz_m1) > sq->edge)
 		mlx5e_send_nop(sq, false);
 
+	sq->bf_budget = bf ? sq->bf_budget - 1 : 0;
+
 	sq->stats.packets++;
 	return NETDEV_TX_OK;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index c34eafbf1c04..603a8b0908ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -654,6 +654,22 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev)
 }
 #endif
 
+static int map_bf_area(struct mlx5_core_dev *dev)
+{
+	resource_size_t bf_start = pci_resource_start(dev->pdev, 0);
+	resource_size_t bf_len = pci_resource_len(dev->pdev, 0);
+
+	dev->priv.bf_mapping = io_mapping_create_wc(bf_start, bf_len);
+
+	return dev->priv.bf_mapping ? 0 : -ENOMEM;
+}
+
+static void unmap_bf_area(struct mlx5_core_dev *dev)
+{
+	if (dev->priv.bf_mapping)
+		io_mapping_free(dev->priv.bf_mapping);
+}
+
 static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -808,10 +824,13 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 		goto err_stop_eqs;
 	}
 
+	if (map_bf_area(dev))
+		dev_err(&pdev->dev, "Failed to map blue flame area\n");
+
 	err = mlx5_irq_set_affinity_hints(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
-		goto err_free_comp_eqs;
+		goto err_unmap_bf_area;
 	}
 
 	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
@@ -823,7 +842,9 @@ static int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
 
 	return 0;
 
-err_free_comp_eqs:
+err_unmap_bf_area:
+	unmap_bf_area(dev);
+
 	free_comp_eqs(dev);
 
 err_stop_eqs:
@@ -881,6 +902,7 @@ static void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
 	mlx5_cleanup_qp_table(dev);
 	mlx5_cleanup_cq_table(dev);
 	mlx5_irq_clear_affinity_hints(dev);
+	unmap_bf_area(dev);
 	free_comp_eqs(dev);
 	mlx5_stop_eqs(dev);
 	mlx5_free_uuars(dev, &priv->uuari);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
index 9ef85873ceea..eb05c845ece9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -32,6 +32,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/io-mapping.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -246,6 +247,10 @@ int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 		goto err_free_uar;
 	}
 
+	if (mdev->priv.bf_mapping)
+		uar->bf_map = io_mapping_map_wc(mdev->priv.bf_mapping,
+						uar->index << PAGE_SHIFT);
+
 	return 0;
 
 err_free_uar:
@@ -257,6 +262,7 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar);
 
 void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar)
 {
+	io_mapping_unmap(uar->bf_map);
 	iounmap(uar->map);
 	mlx5_cmd_free_uar(mdev, uar->index);
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1c0d5d062d7c..5fe0cae1a515 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -380,7 +380,7 @@ struct mlx5_uar {
 	u32			index;
 	struct list_head	bf_list;
 	unsigned		free_bf_bmap;
-	void __iomem	       *wc_map;
+	void __iomem	       *bf_map;
 	void __iomem	       *map;
 };
 
@@ -435,6 +435,8 @@ struct mlx5_priv {
 	struct mlx5_uuar_info	uuari;
 	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
 
+	struct io_mapping	*bf_mapping;
+
 	/* pages stuff */
 	struct workqueue_struct *pg_wq;
 	struct rb_root		page_root;
-- 
cgit v1.2.3


From 77fc29c4bbbbd01ee22c50ce8260fd0f2e08c124 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:31 +0300
Subject: net/mlx4_core: Preparations for 802.1ad VLAN support

mlx4_core preparation to support hardware accelerated 802.1ad VLAN
device.

To allow 802.1ad accelerated device, "packet has vlan" (phv)
Firmware capability should be available. Firmware without the
phv capability won't behave properly and can't support 802.1ad device
acceleration.

The driver checks the Firmware capability and sets the phv bit
accordingly in SET_PORT command.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 82 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/fw.h   |  1 +
 drivers/net/ethernet/mellanox/mlx4/main.c | 15 ++++++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++
 include/linux/mlx4/device.h               |  5 ++
 5 files changed, 106 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e30bf57ad7a1..5a1c3d249530 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -154,6 +154,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[26] = "Port ETS Scheduler support",
 		[27] = "Port beacon support",
 		[28] = "RX-ALL support",
+		[29] = "802.1ad offload support",
 	};
 	int i;
 
@@ -307,6 +308,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 
 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 << 31)
+#define QUERY_FUNC_CAP_PHV_BIT			0x40
 
 	if (vhcr->op_modifier == 1) {
 		struct mlx4_active_ports actv_ports =
@@ -351,6 +353,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave,
 		MLX4_PUT(outbox->buf, dev->caps.phys_port_id[vhcr->in_modifier],
 			 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+		if (dev->caps.phv_bit[port]) {
+			field = QUERY_FUNC_CAP_PHV_BIT;
+			MLX4_PUT(outbox->buf, field,
+				 QUERY_FUNC_CAP_FLAGS0_OFFSET);
+		}
+
 	} else if (vhcr->op_modifier == 0) {
 		struct mlx4_active_ports actv_ports =
 			mlx4_get_active_ports(dev, slave);
@@ -600,6 +608,9 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port,
 		MLX4_GET(func_cap->phys_port_id, outbox,
 			 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+	MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+	func_cap->flags |= (field & QUERY_FUNC_CAP_PHV_BIT);
+
 	/* All other resources are allocated by the master, but we still report
 	 * 'num' and 'reserved' capabilities as follows:
 	 * - num remains the maximum resource index
@@ -700,6 +711,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
 #define QUERY_DEV_CAP_CONFIG_DEV_OFFSET		0x94
+#define QUERY_DEV_CAP_PHV_EN_OFFSET		0x96
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET		0x9c
@@ -898,6 +910,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
 	if (field & (1 << 2))
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+	MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET);
+	if (field & 0x80)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN;
+	if (field & 0x40)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN;
+
 	MLX4_GET(dev_cap->reserved_lkey, outbox,
 		 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
 	MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
@@ -1992,6 +2010,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
 	MLX4_GET(param->uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
 	MLX4_GET(param->log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
 
+	/* phv_check enable */
+	MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET);
+	if (byte_field & 0x2)
+		param->phv_check_en = 1;
 out:
 	mlx4_free_cmd_mailbox(dev, mailbox);
 
@@ -2758,3 +2780,63 @@ int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave,
 			    0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
 			    MLX4_CMD_NATIVE);
 }
+
+static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit)
+{
+#define SET_PORT_GEN_PHV_VALID	0x10
+#define SET_PORT_GEN_PHV_EN	0x80
+
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_set_port_general_context *context;
+	u32 in_mod;
+	int err;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	context = mailbox->buf;
+
+	context->v_ignore_fcs |=  SET_PORT_GEN_PHV_VALID;
+	if (phv_bit)
+		context->phv_en |=  SET_PORT_GEN_PHV_EN;
+
+	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
+	err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE,
+		       MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_NATIVE);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv)
+{
+	int err;
+	struct mlx4_func_cap func_cap;
+
+	memset(&func_cap, 0, sizeof(func_cap));
+	err = mlx4_QUERY_FUNC_CAP(dev, 1, &func_cap);
+	if (!err)
+		*phv = func_cap.flags & QUERY_FUNC_CAP_PHV_BIT;
+	return err;
+}
+EXPORT_SYMBOL(get_phv_bit);
+
+int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val)
+{
+	int ret;
+
+	if (mlx4_is_slave(dev))
+		return -EPERM;
+
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+	    !(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		ret = mlx4_SET_PORT_phv_bit(dev, port, new_val);
+		if (!ret)
+			dev->caps.phv_bit[port] = new_val;
+		return ret;
+	}
+
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(set_phv_bit);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 07cb7c2461ad..08de5555c2f4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -204,6 +204,7 @@ struct mlx4_init_hca_param {
 	u16 cqe_size; /* For use only when CQE stride feature enabled */
 	u16 eqe_size; /* For use only when EQE stride feature enabled */
 	u8 rss_ip_frags;
+	u8 phv_check_en; /* for QUERY_HCA */
 };
 
 struct mlx4_init_ib_param {
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index d76f4257e305..6f35b6c06193 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -405,6 +405,21 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 	dev->caps.max_rss_tbl_sz     = dev_cap->max_rss_tbl_sz;
 
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN) {
+		struct mlx4_init_hca_param hca_param;
+
+		memset(&hca_param, 0, sizeof(hca_param));
+		err = mlx4_QUERY_HCA(dev, &hca_param);
+		/* Turn off PHV_EN flag in case phv_check_en is set.
+		 * phv_check_en is a HW check that parse the packet and verify
+		 * phv bit was reported correctly in the wqe. To allow QinQ
+		 * PHV_EN flag should be set and phv_check_en must be cleared
+		 * otherwise QinQ packets will be drop by the HW.
+		 */
+		if (err || hca_param.phv_check_en)
+			dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_PHV_EN;
+	}
+
 	/* Sense port always allowed on supported devices for ConnectX-1 and -2 */
 	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
 		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index a092c5c34d43..232b2b55f23b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -787,6 +787,9 @@ struct mlx4_set_port_general_context {
 	u8 pprx;
 	u8 pfcrx;
 	u16 reserved4;
+	u32 reserved5;
+	u8 phv_en;
+	u8 reserved6[3];
 };
 
 struct mlx4_set_port_rqp_calc_context {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index fd13c1ce3b4a..bcbf8c72a77b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -211,6 +211,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_ETS_CFG		= 1LL <<  26,
 	MLX4_DEV_CAP_FLAG2_PORT_BEACON		= 1LL <<  27,
 	MLX4_DEV_CAP_FLAG2_IGNORE_FCS		= 1LL <<  28,
+	MLX4_DEV_CAP_FLAG2_PHV_EN		= 1LL <<  29,
+	MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN	= 1LL <<  30,
 };
 
 enum {
@@ -581,6 +583,7 @@ struct mlx4_caps {
 	u64			phys_port_id[MLX4_MAX_PORTS + 1];
 	int			tunnel_offload_mode;
 	u8			rx_checksum_flags_port[MLX4_MAX_PORTS + 1];
+	u8			phv_bit[MLX4_MAX_PORTS + 1];
 	u8			alloc_res_qp_mask;
 	u32			dmfs_high_rate_qpn_base;
 	u32			dmfs_high_rate_qpn_range;
@@ -1332,6 +1335,8 @@ int mlx4_SET_PORT_BEACON(struct mlx4_dev *dev, u8 port, u16 time);
 int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port,
 			    u8 ignore_fcs_value);
 int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable);
+int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val);
+int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv);
 int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx);
 int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
 int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
-- 
cgit v1.2.3


From e802f8e4c54e6adf4215ef9fa3d6eea8fcb10bf9 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:33 +0300
Subject: net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated
 support

To add Hardware accelerated support in 802.1ad vlan, replace
Current VLAN macros to CVLAN.
Replace:
MLX4_WQE_CTRL_INS_VLAN
MLX4_CQE_VLAN_PRESENT_MASK
With:
MLX4_WQE_CTRL_INS_CVLAN
MLX4_CQE_CVLAN_PRESENT_MASK

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx4/cq.c            | 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +-
 include/linux/mlx4/cq.h                    | 2 +-
 include/linux/mlx4/qp.h                    | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 36eb3d012b6d..180a8f7ec82d 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -871,7 +871,7 @@ repoll:
 		if (is_eth) {
 			wc->sl  = be16_to_cpu(cqe->sl_vid) >> 13;
 			if (be32_to_cpu(cqe->vlan_my_qpn) &
-					MLX4_CQE_VLAN_PRESENT_MASK) {
+					MLX4_CQE_CVLAN_PRESENT_MASK) {
 				wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
 					MLX4_CQE_VID_MASK;
 			} else {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 12c65e1ad6a9..10f6c2f1d5a0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -726,7 +726,7 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
 
 	hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
 
-	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK) &&
+	if (cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) &&
 	    !(dev_features & NETIF_F_HW_VLAN_CTAG_RX)) {
 		hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
 		hdr += sizeof(struct vlan_hdr);
@@ -907,7 +907,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				gro_skb->csum_level = 1;
 
 			if ((cqe->vlan_my_qpn &
-			    cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
+			    cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) &&
 			    (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
 				u16 vid = be16_to_cpu(cqe->sl_vid);
 
@@ -970,7 +970,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 					PKT_HASH_TYPE_L3);
 
 		if ((be32_to_cpu(cqe->vlan_my_qpn) &
-		    MLX4_CQE_VLAN_PRESENT_MASK) &&
+		    MLX4_CQE_CVLAN_PRESENT_MASK) &&
 		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index c10d98f6ad96..7c858f67ef28 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -958,7 +958,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
 		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN *
 			!!skb_vlan_tag_present(skb);
 		tx_desc->ctrl.fence_size = real_size;
 
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
index e7ecc12a1163..899a97b20d27 100644
--- a/include/linux/mlx4/cq.h
+++ b/include/linux/mlx4/cq.h
@@ -88,7 +88,7 @@ struct mlx4_ts_cqe {
 
 enum {
 	MLX4_CQE_L2_TUNNEL_IPOK		= 1 << 31,
-	MLX4_CQE_VLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_CVLAN_PRESENT_MASK	= 1 << 29,
 	MLX4_CQE_L2_TUNNEL		= 1 << 27,
 	MLX4_CQE_L2_TUNNEL_CSUM		= 1 << 26,
 	MLX4_CQE_L2_TUNNEL_IPV4		= 1 << 25,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6fed539e5456..6c619006c21f 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -272,7 +272,7 @@ enum {
 	MLX4_WQE_CTRL_SOLICITED		= 1 << 1,
 	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
 	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
-	MLX4_WQE_CTRL_INS_VLAN		= 1 << 6,
+	MLX4_WQE_CTRL_INS_CVLAN		= 1 << 6,
 	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
 	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
 };
-- 
cgit v1.2.3


From e38af4faf01d0b35df6995fb395e5fa4a4898289 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 27 Jul 2015 14:46:34 +0300
Subject: net/mlx4_en: Add support for hardware accelerated 802.1ad vlan

To enable device support in accelerated 802.1ad vlan, the port
capability "packet has vlan enable" (phv_en) should be set.
Firmware won't work properly, in case phv_en is not set.

The user can enable "phv_en" port capability with the new ethtool
private flag phv-bit. The phv-bit private flag default value is OFF,
users who are interested in 802.1ad hardware acceleration should turn ON
the phv-bit private flag:
$ ethtool --set-priv-flags eth1 phv-bit on

Once the private flag is set, the device is ready for 802.1ad vlan
acceleration.

The user should also change the interface device features and turn on
"tx-vlan-stag-hw-insert" which is off by default:
$ ethtool -K eth1  tx-vlan-stag-hw-insert on

"phv-bit" private flag setting is available only for Physical
Functions(PF), the Virtual Function (VF) will be able to use the feature
by setting "tx-vlan-stag-hw-insert" ethtool device feature only if the
feature was enabled by the Hypervisor.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 16 +++++++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 46 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      | 16 ++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c      | 13 ++++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    |  1 +
 include/linux/mlx4/cq.h                         |  1 +
 include/linux/mlx4/qp.h                         |  1 +
 7 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 70f65534e786..f79d8124321e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -102,6 +102,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo)
 
 static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = {
 	"blueflame",
+	"phv-bit"
 };
 
 static const char main_strings[][ETH_GSTRING_LEN] = {
@@ -1797,9 +1798,13 @@ static int mlx4_en_get_ts_info(struct net_device *dev,
 static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	bool bf_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
 	bool bf_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+	bool phv_enabled_new = !!(flags & MLX4_EN_PRIV_FLAGS_PHV);
+	bool phv_enabled_old = !!(priv->pflags & MLX4_EN_PRIV_FLAGS_PHV);
 	int i;
+	int ret = 0;
 
 	if (bf_enabled_new != bf_enabled_old) {
 		if (bf_enabled_new) {
@@ -1825,6 +1830,17 @@ static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
 			bf_enabled_new ?  "Enabled" : "Disabled");
 	}
 
+	if (phv_enabled_new != phv_enabled_old) {
+		ret = set_phv_bit(mdev->dev, priv->port, (int)phv_enabled_new);
+		if (ret)
+			return ret;
+		else if (phv_enabled_new)
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		else
+			priv->pflags &= ~MLX4_EN_PRIV_FLAGS_PHV;
+		en_info(priv, "PHV bit %s\n",
+			phv_enabled_new ?  "Enabled" : "Disabled");
+	}
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index e0de2fd1ce12..4726122ea76b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2184,6 +2184,25 @@ static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	}
 }
 
+static netdev_features_t mlx4_en_fix_features(struct net_device *netdev,
+					      netdev_features_t features)
+{
+	struct mlx4_en_priv *en_priv = netdev_priv(netdev);
+	struct mlx4_en_dev *mdev = en_priv->mdev;
+
+	/* Since there is no support for separate RX C-TAG/S-TAG vlan accel
+	 * enable/disable make sure S-TAG flag is always in same state as
+	 * C-TAG.
+	 */
+	if (features & NETIF_F_HW_VLAN_CTAG_RX &&
+	    !(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+		features |= NETIF_F_HW_VLAN_STAG_RX;
+	else
+		features &= ~NETIF_F_HW_VLAN_STAG_RX;
+
+	return features;
+}
+
 static int mlx4_en_set_features(struct net_device *netdev,
 		netdev_features_t features)
 {
@@ -2218,6 +2237,10 @@ static int mlx4_en_set_features(struct net_device *netdev,
 		en_info(priv, "Turn %s TX vlan strip offload\n",
 			(features & NETIF_F_HW_VLAN_CTAG_TX) ? "ON" : "OFF");
 
+	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX))
+		en_info(priv, "Turn %s TX S-VLAN strip offload\n",
+			(features & NETIF_F_HW_VLAN_STAG_TX) ? "ON" : "OFF");
+
 	if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_LOOPBACK)) {
 		en_info(priv, "Turn %s loopback\n",
 			(features & NETIF_F_LOOPBACK) ? "ON" : "OFF");
@@ -2460,6 +2483,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_poll_controller	= mlx4_en_netpoll,
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
 	.ndo_setup_tc		= mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
@@ -2500,6 +2524,7 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_poll_controller	= mlx4_en_netpoll,
 #endif
 	.ndo_set_features	= mlx4_en_set_features,
+	.ndo_fix_features	= mlx4_en_fix_features,
 	.ndo_setup_tc		= mlx4_en_setup_tc,
 #ifdef CONFIG_RFS_ACCEL
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
@@ -2931,6 +2956,27 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	dev->hw_features |= NETIF_F_LOOPBACK |
 			NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
+	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) {
+		dev->features |= NETIF_F_HW_VLAN_STAG_RX |
+			NETIF_F_HW_VLAN_STAG_FILTER;
+		dev->hw_features |= NETIF_F_HW_VLAN_STAG_RX;
+	}
+
+	if (mlx4_is_slave(mdev->dev)) {
+		int phv;
+
+		err = get_phv_bit(mdev->dev, port, &phv);
+		if (!err && phv) {
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+			priv->pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+		}
+	} else {
+		if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PHV_EN &&
+		    !(mdev->dev->caps.flags2 &
+		      MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+			dev->hw_features |= NETIF_F_HW_VLAN_STAG_TX;
+	}
+
 	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)
 		dev->hw_features |= NETIF_F_RXFCS;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 10f6c2f1d5a0..a67fbb90d69e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -912,6 +912,12 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				u16 vid = be16_to_cpu(cqe->sl_vid);
 
 				__vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
+			} else if ((be32_to_cpu(cqe->vlan_my_qpn) &
+				  MLX4_CQE_SVLAN_PRESENT_MASK) &&
+				 (dev->features & NETIF_F_HW_VLAN_STAG_RX)) {
+				__vlan_hwaccel_put_tag(gro_skb,
+						       htons(ETH_P_8021AD),
+						       be16_to_cpu(cqe->sl_vid));
 			}
 
 			if (dev->features & NETIF_F_RXHASH)
@@ -973,6 +979,11 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		    MLX4_CQE_CVLAN_PRESENT_MASK) &&
 		    (dev->features & NETIF_F_HW_VLAN_CTAG_RX))
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe->sl_vid));
+		else if ((be32_to_cpu(cqe->vlan_my_qpn) &
+			  MLX4_CQE_SVLAN_PRESENT_MASK) &&
+			 (dev->features & NETIF_F_HW_VLAN_STAG_RX))
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD),
+					       be16_to_cpu(cqe->sl_vid));
 
 		if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
 			timestamp = mlx4_en_get_cqe_ts(cqe);
@@ -1070,7 +1081,10 @@ static const int frag_sizes[] = {
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	int eff_mtu = dev->mtu + ETH_HLEN + VLAN_HLEN;
+	/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
+	 * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+	 */
+	int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
 	int buf_size = 0;
 	int i = 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 7c858f67ef28..494e7762fdb1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -718,6 +718,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 index, bf_index;
 	__be32 op_own;
 	u16 vlan_tag = 0;
+	u16 vlan_proto = 0;
 	int i_frag;
 	int lso_header_size;
 	void *fragptr = NULL;
@@ -750,9 +751,10 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 	}
 
-	if (skb_vlan_tag_present(skb))
+	if (skb_vlan_tag_present(skb)) {
 		vlan_tag = skb_vlan_tag_get(skb);
-
+		vlan_proto = be16_to_cpu(skb->vlan_proto);
+	}
 
 	netdev_txq_bql_enqueue_prefetchw(ring->tx_queue);
 
@@ -958,8 +960,11 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
 		tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN *
-			!!skb_vlan_tag_present(skb);
+		if (vlan_proto == ETH_P_8021AD)
+			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_SVLAN;
+		else if (vlan_proto == ETH_P_8021Q)
+			tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN;
+
 		tx_desc->ctrl.fence_size = real_size;
 
 		/* Ensure new descriptor hits memory
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 666d1669eb52..defcf8c395bf 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -95,6 +95,7 @@
  */
 
 #define MLX4_EN_PRIV_FLAGS_BLUEFLAME 1
+#define MLX4_EN_PRIV_FLAGS_PHV	     2
 
 #define MLX4_EN_WATCHDOG_TIMEOUT	(15 * HZ)
 
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
index 899a97b20d27..09cebe528488 100644
--- a/include/linux/mlx4/cq.h
+++ b/include/linux/mlx4/cq.h
@@ -89,6 +89,7 @@ struct mlx4_ts_cqe {
 enum {
 	MLX4_CQE_L2_TUNNEL_IPOK		= 1 << 31,
 	MLX4_CQE_CVLAN_PRESENT_MASK	= 1 << 29,
+	MLX4_CQE_SVLAN_PRESENT_MASK	= 1 << 30,
 	MLX4_CQE_L2_TUNNEL		= 1 << 27,
 	MLX4_CQE_L2_TUNNEL_CSUM		= 1 << 26,
 	MLX4_CQE_L2_TUNNEL_IPV4		= 1 << 25,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6c619006c21f..de45a51b3f04 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -273,6 +273,7 @@ enum {
 	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
 	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5,
 	MLX4_WQE_CTRL_INS_CVLAN		= 1 << 6,
+	MLX4_WQE_CTRL_INS_SVLAN		= 1 << 7,
 	MLX4_WQE_CTRL_STRONG_ORDER	= 1 << 7,
 	MLX4_WQE_CTRL_FORCE_LOOPBACK	= 1 << 0,
 };
-- 
cgit v1.2.3


From 0933328a1b8adb6c8b2b8c8b823dad0295659c40 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Wed, 29 Jul 2015 00:09:02 +0200
Subject: stmmac: remove unused stmmac_of_data struct

As dwmac-* drivers that need OF match have been converted
to use their own internal OF match data structure this can
now be removed.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt |  2 --
 include/linux/stmmac.h              | 18 ------------------
 2 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 5fddefa69baf..de5c42342ec3 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -274,8 +274,6 @@ capability register can replace what has been passed from the platform.
 Please see the following document:
 	Documentation/devicetree/bindings/net/stmmac.txt
 
-and the stmmac_of_data structure inside the include/linux/stmmac.h header file.
-
 4.11) This is a summary of the content of some relevant files:
  o stmmac_main.c: to implement the main network device driver;
  o stmmac_mdio.c: to provide mdio functions;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c86a20047cb1..b43cd56b78e9 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -125,22 +125,4 @@ struct plat_stmmacenet_data {
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 };
-
-/* of_data for SoC glue layer device tree bindings */
-
-struct stmmac_of_data {
-	int has_gmac;
-	int enh_desc;
-	int tx_coe;
-	int rx_coe;
-	int bugged_jumbo;
-	int pmt;
-	int riwt_off;
-	void (*fix_mac_speed)(void *priv, unsigned int speed);
-	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
-	int (*init)(struct platform_device *pdev, void *priv);
-	void (*exit)(struct platform_device *pdev, void *priv);
-};
 #endif
-- 
cgit v1.2.3


From 75fee59550a9899fd9438ebc0a64c972829a8dd2 Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Wed, 29 Jul 2015 00:09:03 +0200
Subject: stmmac: remove setup/free glue callbacks

As all dwmac-* drivers have been converted to have a proper probe
function the setup callback can now be removed. Also remove the
free callback that wasn't used by any driver.

New dwmac-* drivers should implement standard probe and remove
functions to preform any needed setup and teardown.

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/stmmac.txt                   | 8 ++------
 drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c   | 7 -------
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 ---
 include/linux/stmmac.h                                | 2 --
 4 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index de5c42342ec3..2903b1cf4d70 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -135,8 +135,6 @@ struct plat_stmmacenet_data {
 	int maxmtu;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
@@ -177,12 +175,10 @@ Where:
  o bus_setup: perform HW setup of the bus. For example, on some ST platforms
 	     this field is used to configure the AMBA  bridge to generate more
 	     efficient STBus traffic.
- o setup/init/exit: callbacks used for calling a custom initialization;
+ o init/exit: callbacks used for calling a custom initialization;
 	     this is sometime necessary on some platforms (e.g. ST boxes)
 	     where the HW needs to have set some PIO lines or system cfg
-	     registers. setup should return a pointer to private data,
-	     which will be stored in bsp_priv, and then passed to init and
-	     exit callbacks. init/exit callbacks should not use or modify
+	     registers.  init/exit callbacks should not use or modify
 	     platform data.
  o bsp_priv: another private pointer.
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
index f4fe9f1a33b4..b1e5f24708c9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c
@@ -46,13 +46,6 @@ static int dwmac_generic_probe(struct platform_device *pdev)
 		plat_dat->unicast_filter_entries = 1;
 	}
 
-	/* Custom setup (if needed) */
-	if (plat_dat->setup) {
-		plat_dat->bsp_priv = plat_dat->setup(pdev);
-		if (IS_ERR(plat_dat->bsp_priv))
-			return PTR_ERR(plat_dat->bsp_priv);
-	}
-
 	/* Custom initialisation (if needed) */
 	if (plat_dat->init) {
 		ret = plat_dat->init(pdev, plat_dat->bsp_priv);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 55e569b330b2..1cb660405f35 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -300,9 +300,6 @@ int stmmac_pltfr_remove(struct platform_device *pdev)
 	if (priv->plat->exit)
 		priv->plat->exit(pdev, priv->plat->bsp_priv);
 
-	if (priv->plat->free)
-		priv->plat->free(pdev, priv->plat->bsp_priv);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stmmac_pltfr_remove);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index b43cd56b78e9..eead8ab93c0a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -119,8 +119,6 @@ struct plat_stmmacenet_data {
 	int rx_fifo_size;
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
 	void (*bus_setup)(void __iomem *ioaddr);
-	void *(*setup)(struct platform_device *pdev);
-	void (*free)(struct platform_device *pdev, void *priv);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
-- 
cgit v1.2.3


From 72b1e5e4cac72efa6b739b47e41f53e4520b4194 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 23 Jul 2015 16:21:30 +0200
Subject: netfilter: bridge: reduce nf_bridge_info to 32 bytes again

We can use union for most of the temporary cruft (original ipv4/ipv6
address, source mac, physoutdev) since they're used during different
stages of br netfilter traversal.

Also get rid of the last two ->mask users.

Shrinks struct from 48 to 32 on 64bit arch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge.h          | 12 +++++++++---
 include/linux/skbuff.h                    | 19 +++++++++++++------
 net/bridge/br_netfilter_hooks.c           | 14 ++++++--------
 net/bridge/br_netfilter_ipv6.c            |  2 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c       |  7 ++-----
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c |  7 ++-----
 6 files changed, 33 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index 6d80fc686323..2437b8a5d7a9 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -17,9 +17,6 @@ enum nf_br_hook_priorities {
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 
-#define BRNF_BRIDGED_DNAT		0x02
-#define BRNF_NF_BRIDGE_PREROUTING	0x08
-
 int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb);
 
 static inline void br_drop_fake_rtable(struct sk_buff *skb)
@@ -63,8 +60,17 @@ nf_bridge_get_physoutdev(const struct sk_buff *skb)
 {
 	return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL;
 }
+
+static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
+{
+	return skb->nf_bridge && skb->nf_bridge->in_prerouting;
+}
 #else
 #define br_drop_fake_rtable(skb)	        do { } while (0)
+static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb)
+{
+	return false;
+}
 #endif /* CONFIG_BRIDGE_NETFILTER */
 
 #endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d6cdd6e87d53..ac732e67a6c8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -173,17 +173,24 @@ struct nf_bridge_info {
 		BRNF_PROTO_8021Q,
 		BRNF_PROTO_PPPOE
 	} orig_proto:8;
-	bool			pkt_otherhost;
+	u8			pkt_otherhost:1;
+	u8			in_prerouting:1;
+	u8			bridged_dnat:1;
 	__u16			frag_max_size;
-	unsigned int		mask;
 	struct net_device	*physindev;
 	union {
-		struct net_device *physoutdev;
-		char neigh_header[8];
-	};
-	union {
+		/* prerouting: detect dnat in orig/reply direction */
 		__be32          ipv4_daddr;
 		struct in6_addr ipv6_daddr;
+
+		/* after prerouting + nat detected: store original source
+		 * mac since neigh resolution overwrites it, only used while
+		 * skb is out in neigh layer.
+		 */
+		char neigh_header[8];
+
+		/* always valid & non-NULL from FORWARD on, for physdev match */
+		struct net_device *physoutdev;
 	};
 };
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c8b9bcfe997e..ec51c2ba30e9 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
 							 nf_bridge->neigh_header,
 							 ETH_HLEN-ETH_ALEN);
 			/* tell br_dev_xmit to continue with forwarding */
-			nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+			nf_bridge->bridged_dnat = 1;
 			/* FIXME Need to refragment */
 			ret = neigh->output(neigh, skb);
 		}
@@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
 	}
-	nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 0;
 	if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
 		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
 			struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 		nf_bridge->pkt_otherhost = true;
 	}
 
-	nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
 	skb->dev = brnf_get_logical_dev(skb, skb->dev);
 
@@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
 				   struct sk_buff *skb,
 				   const struct nf_hook_state *state)
 {
-	if (skb->nf_bridge &&
-	    !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+	if (skb->nf_bridge && !skb->nf_bridge->in_prerouting)
 		return NF_STOP;
-	}
 
 	return NF_ACCEPT;
 }
@@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
 	skb_pull(skb, ETH_HLEN);
-	nf_bridge->mask &= ~BRNF_BRIDGED_DNAT;
+	nf_bridge->bridged_dnat = 0;
 
 	BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN));
 
@@ -887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 
 static int br_nf_dev_xmit(struct sk_buff *skb)
 {
-	if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+	if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
 		br_nf_pre_routing_finish_bridge_slow(skb);
 		return 1;
 	}
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 13b7d1e3d185..77383bfe7ea3 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->pkt_otherhost = false;
 	}
-	nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->in_prerouting = 0;
 	if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) {
 		skb_dst_drop(skb);
 		v6ops->route_input(skb);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index c88b7d434718..b69e82bda215 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 	if (skb->nfct)
 		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
 #endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge &&
-	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+	if (nf_bridge_in_prerouting(skb))
 		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
 	if (hooknum == NF_INET_PRE_ROUTING)
 		return IP_DEFRAG_CONNTRACK_IN + zone;
 	else
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index a45db0b4785c..267fb8d5876e 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 	if (skb->nfct)
 		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
 #endif
-
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge &&
-	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+	if (nf_bridge_in_prerouting(skb))
 		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
-#endif
+
 	if (hooknum == NF_INET_PRE_ROUTING)
 		return IP6_DEFRAG_CONNTRACK_IN + zone;
 	else
-- 
cgit v1.2.3


From 7b36f92934e40d1ee24e5617ddedb852e10086ca Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jul 2015 12:42:47 +0200
Subject: bpf: provide helper that indicates eBPF was migrated

During recent discussions we had with Michael, we found that it would
be useful to have an indicator that tells the JIT that an eBPF program
had been migrated from classic instructions into eBPF instructions, as
only in that case A and X need to be cleared in the prologue. Such eBPF
programs do not set a particular type, but all have BPF_PROG_TYPE_UNSPEC.
Thus, introduce a small helper for cde66c2d88da ("s390/bpf: Only clear
A and X for converted BPF programs") and possibly others in future.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/s390/net/bpf_jit_comp.c |  2 +-
 include/linux/filter.h       | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bbbac6da37af..9f4bbc09bf07 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1245,7 +1245,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
 	jit->lit = jit->lit_start;
 	jit->prg = 0;
 
-	bpf_jit_prologue(jit, fp->type == BPF_PROG_TYPE_UNSPEC);
+	bpf_jit_prologue(jit, bpf_prog_was_classic(fp));
 	for (i = 0; i < fp->len; i += insn_count) {
 		insn_count = bpf_jit_insn(jit, fp, i);
 		if (insn_count < 0)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 69d00555ce35..6b025491120d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -354,6 +354,16 @@ static inline unsigned int bpf_prog_size(unsigned int proglen)
 		   offsetof(struct bpf_prog, insns[proglen]));
 }
 
+static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
+{
+	/* When classic BPF programs have been loaded and the arch
+	 * does not have a classic BPF JIT (anymore), they have been
+	 * converted via bpf_migrate_filter() to eBPF and thus always
+	 * have an unspec program type.
+	 */
+	return prog->type == BPF_PROG_TYPE_UNSPEC;
+}
+
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
 #ifdef CONFIG_DEBUG_SET_MODULE_RONX
-- 
cgit v1.2.3


From b13138ef72178a13f34e33883f9f093f9e3b1bda Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 30 Jul 2015 12:42:49 +0200
Subject: bpf: also show process name/pid in bpf_jit_dump

It can be useful for testing to see the actual process/pid who is loading
a given filter. I was running some BPF test program and noticed unusual
filter loads from time to time, triggered by some other application in the
background. bpf_jit_disasm is still working after this change.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6b025491120d..fa2cab985e57 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -12,6 +12,7 @@
 #include <linux/linkage.h>
 #include <linux/printk.h>
 #include <linux/workqueue.h>
+#include <linux/sched.h>
 
 #include <asm/cacheflush.h>
 
@@ -438,8 +439,9 @@ void bpf_jit_free(struct bpf_prog *fp);
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
 {
-	pr_err("flen=%u proglen=%u pass=%u image=%pK\n",
-	       flen, proglen, pass, image);
+	pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
+	       proglen, pass, image, current->comm, task_pid_nr(current));
+
 	if (image)
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
-- 
cgit v1.2.3


From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Thu, 30 Jul 2015 14:28:42 +0800
Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit

Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface")
disabled accept hop limit from RA if it is smaller than the current hop
limit for security stuff. But this behavior kind of break the RFC definition.

RFC 4861, 6.3.4.  Processing Received Router Advertisements
   A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time,
   and Retrans Timer) may contain a value denoting that it is
   unspecified.  In such cases, the parameter should be ignored and the
   host should continue using whatever value it is already using.

   If the received Cur Hop Limit value is non-zero, the host SHOULD set
   its CurHopLimit variable to the received value.

So add sysctl option accept_ra_min_hop_limit to let user choose the minimum
hop limit value they can accept from RA. And set default to 1 to meet RFC
standards.

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  8 ++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 10 ++++++++++
 net/ipv6/ndisc.c                       | 16 +++++++---------
 5 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1a5ab21bcca5..00d26d919459 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1340,6 +1340,14 @@ accept_ra_from_local - BOOLEAN
 	   disabled if accept_ra_from_local is disabled
                on a specific interface.
 
+accept_ra_min_hop_limit - INTEGER
+	Minimum hop limit Information in Router Advertisement.
+
+	Hop limit Information in Router Advertisement less than this
+	variable shall be ignored.
+
+	Default: 1
+
 accept_ra_pinfo - BOOLEAN
 	Learn Prefix Information in Router Advertisement.
 
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 06ed637225b8..cb9dcad72372 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -29,6 +29,7 @@ struct ipv6_devconf {
 	__s32		max_desync_factor;
 	__s32		max_addresses;
 	__s32		accept_ra_defrtr;
+	__s32		accept_ra_min_hop_limit;
 	__s32		accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 641a146ead7d..80f3b74446a1 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -172,6 +172,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MTU,
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
+	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index eb0c6a3a8a00..53e3a9d756b0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_from_local	= 0,
+	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
@@ -237,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.max_addresses		= IPV6_MAX_ADDRESSES,
 	.accept_ra_defrtr	= 1,
 	.accept_ra_from_local	= 0,
+	.accept_ra_min_hop_limit= 1,
 	.accept_ra_pinfo	= 1,
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	.accept_ra_rtr_pref	= 1,
@@ -4588,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
 	array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
 	array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
+	array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
 	array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
@@ -5484,6 +5487,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "accept_ra_min_hop_limit",
+			.data		= &ipv6_devconf.accept_ra_min_hop_limit,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			.procname	= "accept_ra_pinfo",
 			.data		= &ipv6_devconf.accept_ra_pinfo,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0a05b35a90fc..6e184e02fd3c 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	if (rt)
 		rt6_set_expires(rt, jiffies + (HZ * lifetime));
-	if (ra_msg->icmph.icmp6_hop_limit) {
-		/* Only set hop_limit on the interface if it is higher than
-		 * the current hop_limit.
-		 */
-		if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) {
+	if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
+	    ra_msg->icmph.icmp6_hop_limit) {
+		if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
 			in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+			if (rt)
+				dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
+					       ra_msg->icmph.icmp6_hop_limit);
 		} else {
-			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n");
+			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
 		}
-		if (rt)
-			dst_metric_set(&rt->dst, RTAX_HOPLIMIT,
-				       ra_msg->icmph.icmp6_hop_limit);
 	}
 
 skip_defrtr:
-- 
cgit v1.2.3


From f70ea018da0631e10c26a02f5a82d626ffef5bd7 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 31 Jul 2015 16:52:10 -0700
Subject: net: Add functions to get skb->hash based on flow structures

Add skb_get_hash_flowi6 and skb_get_hash_flowi4 which derive an sk_buff
hash from flowi6 and flowi4 structures respectively. These functions
can be called when creating a packet in the output path where the new
sk_buff does not yet contain a fully formed packet that is parsable by
flow dissector.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 21 +++++++++++++++++
 net/core/flow_dissector.c | 58 +++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 75 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 648a2c241993..b7c1286e247d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,6 +37,7 @@
 #include <net/flow_dissector.h>
 #include <linux/splice.h>
 #include <linux/in6.h>
+#include <net/flow.h>
 
 /* A. Checksumming of received packets by device.
  *
@@ -945,6 +946,26 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
+
+static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+{
+	if (!skb->l4_hash && !skb->sw_hash)
+		__skb_get_hash_flowi6(skb, fl6);
+
+	return skb->hash;
+}
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
+
+static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+{
+	if (!skb->l4_hash && !skb->sw_hash)
+		__skb_get_hash_flowi4(skb, fl4);
+
+	return skb->hash;
+}
+
 __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb);
 
 static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2a834c6179b9..11e6540fa386 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,6 +590,15 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
+static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
+				     struct flow_keys *keys)
+{
+	if (keys->ports.ports)
+		skb->l4_hash = 1;
+	skb->sw_hash = 1;
+	skb->hash = hash;
+}
+
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -609,10 +618,8 @@ void __skb_get_hash(struct sk_buff *skb)
 	hash = ___skb_get_hash(skb, &keys, hashrnd);
 	if (!hash)
 		return;
-	if (keys.ports.ports)
-		skb->l4_hash = 1;
-	skb->sw_hash = 1;
-	skb->hash = hash;
+
+	__skb_set_sw_hash(skb, hash, &keys);
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
@@ -624,6 +631,49 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+{
+	struct flow_keys keys;
+
+	memset(&keys, 0, sizeof(keys));
+
+	memcpy(&keys.addrs.v6addrs.src, &fl6->saddr,
+	       sizeof(keys.addrs.v6addrs.src));
+	memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr,
+	       sizeof(keys.addrs.v6addrs.dst));
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys.ports.src = fl6->fl6_sport;
+	keys.ports.dst = fl6->fl6_dport;
+	keys.keyid.keyid = fl6->fl6_gre_key;
+	keys.tags.flow_label = (__force u32)fl6->flowlabel;
+	keys.basic.ip_proto = fl6->flowi6_proto;
+
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+	return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi6);
+
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+{
+	struct flow_keys keys;
+
+	memset(&keys, 0, sizeof(keys));
+
+	keys.addrs.v4addrs.src = fl4->saddr;
+	keys.addrs.v4addrs.dst = fl4->daddr;
+	keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	keys.ports.src = fl4->fl4_sport;
+	keys.ports.dst = fl4->fl4_dport;
+	keys.keyid.keyid = fl4->fl4_gre_key;
+	keys.basic.ip_proto = fl4->flowi4_proto;
+
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+
+	return skb->hash;
+}
+EXPORT_SYMBOL(__skb_get_hash_flowi4);
+
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen)
 {
-- 
cgit v1.2.3


From d9eea403ca81f60cd535d354c77ada4c2bee8d66 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:42 +0300
Subject: net/mlx5_core: Introduce access function to modify RSS/LRO params

To be used by the mlx5 Eth driver in following commit.

This is in preparation for netdev "light-weight" open/stop flow
change described in previous commit.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 12 ++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |  2 ++
 include/linux/mlx5/mlx5_ifc.h                      |  9 ++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index c4f3f74908ec..e6453f61141e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -163,6 +163,18 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return err;
 }
 
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_tir_out)];
+
+	MLX5_SET(modify_tir_in, in, tirn, tirn);
+	MLX5_SET(modify_tir_in, in, opcode, MLX5_CMD_OP_MODIFY_TIR);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_tir_out)];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
index 10bd75e7d9b1..d436c2d8b527 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -45,6 +45,8 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen);
 void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn);
 int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tirn);
+int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
+			 int inlen);
 void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
 int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tisn);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c60a62bba652..469b7bda3304 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4050,6 +4050,13 @@ struct mlx5_ifc_modify_tis_in_bits {
 	struct mlx5_ifc_tisc_bits ctx;
 };
 
+struct mlx5_ifc_modify_tir_bitmask_bits {
+	u8	   reserved[0x20];
+
+	u8         reserved1[0x1f];
+	u8         lro[0x1];
+};
+
 struct mlx5_ifc_modify_tir_out_bits {
 	u8         status[0x8];
 	u8         reserved_0[0x18];
@@ -4071,7 +4078,7 @@ struct mlx5_ifc_modify_tir_in_bits {
 
 	u8         reserved_3[0x20];
 
-	u8         modify_bitmask[0x40];
+	struct mlx5_ifc_modify_tir_bitmask_bits bitmask;
 
 	u8         reserved_4[0x40];
 
-- 
cgit v1.2.3


From 5c50368f38317627421bf24a0b66b1af0d44eddc Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:43 +0300
Subject: net/mlx5e: Light-weight netdev open/stop

Create/destroy TIRs, TISs and flow tables upon PCI probe/remove rather
than upon the netdev ndo_open/stop.

Upon ndo_stop(), redirect all RX traffic to the (lately introduced)
"Drop RQ" and then close only the RX/TX rings, leaving the TIRs,
TISs and flow tables alive.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 237 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c |  12 ++
 drivers/net/ethernet/mellanox/mlx5/core/transobj.h |   2 +
 include/linux/mlx5/mlx5_ifc.h                      |   9 +-
 4 files changed, 184 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index baa7a69bb694..33d08bb11f84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1301,14 +1301,18 @@ static void mlx5e_fill_rqt_rqns(struct mlx5e_priv *priv, void *rqtc,
 
 			ix = ix % priv->params.num_channels;
 			MLX5_SET(rqtc, rqtc, rq_num[i],
-				 priv->channel[ix]->rq.rqn);
+				 test_bit(MLX5E_STATE_OPENED, &priv->state) ?
+				 priv->channel[ix]->rq.rqn :
+				 priv->drop_rq.rqn);
 		}
 
 		break;
 
 	default: /* MLX5E_SINGLE_RQ_RQT */
 		MLX5_SET(rqtc, rqtc, rq_num[0],
-			 priv->channel[0]->rq.rqn);
+			 test_bit(MLX5E_STATE_OPENED, &priv->state) ?
+			 priv->channel[0]->rq.rqn :
+			 priv->drop_rq.rqn);
 
 		break;
 	}
@@ -1347,19 +1351,95 @@ static int mlx5e_open_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
 	return err;
 }
 
+static int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u32 *in;
+	void *rqtc;
+	int inlen;
+	int log_sz;
+	int sz;
+	int err;
+
+	log_sz = (rqt_ix == MLX5E_SINGLE_RQ_RQT) ? 0 :
+		  priv->params.rx_hash_log_tbl_sz;
+	sz = 1 << log_sz;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + sizeof(u32) * sz;
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
+
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
+
+	mlx5e_fill_rqt_rqns(priv, rqtc, rqt_ix);
+
+	MLX5_SET(modify_rqt_in, in, bitmask.rqn_list, 1);
+
+	err = mlx5_core_modify_rqt(mdev, priv->rqtn[rqt_ix], in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_close_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix)
 {
 	mlx5_core_destroy_rqt(priv->mdev, priv->rqtn[rqt_ix]);
 }
 
+static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv)
+{
+	if (!priv->params.lro_en)
+		return;
+
+#define ROUGH_MAX_L2_L3_HDR_SZ 256
+
+	MLX5_SET(tirc, tirc, lro_enable_mask,
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
+		 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
+	MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
+		 (priv->params.lro_wqe_sz -
+		  ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
+	MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
+		 MLX5_CAP_ETH(priv->mdev,
+			      lro_timer_supported_periods[3]));
+}
+
+static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *tirc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_tir_in, in, bitmask.lro, 1);
+	tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx);
+
+	mlx5e_build_tir_ctx_lro(tirc, priv);
+
+	err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen);
+
+	kvfree(in);
+
+	return err;
+}
+
 static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 {
 	void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 
 	MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
 
-#define ROUGH_MAX_L2_L3_HDR_SZ 256
-
 #define MLX5_HASH_IP            (MLX5_HASH_FIELD_SEL_SRC_IP   |\
 				 MLX5_HASH_FIELD_SEL_DST_IP)
 
@@ -1372,17 +1452,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt)
 				 MLX5_HASH_FIELD_SEL_DST_IP   |\
 				 MLX5_HASH_FIELD_SEL_IPSEC_SPI)
 
-	if (priv->params.lro_en) {
-		MLX5_SET(tirc, tirc, lro_enable_mask,
-			 MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO |
-			 MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO);
-		MLX5_SET(tirc, tirc, lro_max_ip_payload_size,
-			 (priv->params.lro_wqe_sz -
-			  ROUGH_MAX_L2_L3_HDR_SZ) >> 8);
-		MLX5_SET(tirc, tirc, lro_timeout_period_usecs,
-			 MLX5_CAP_ETH(priv->mdev,
-				      lro_timer_supported_periods[3]));
-	}
+	mlx5e_build_tir_ctx_lro(tirc, priv);
 
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 
@@ -1568,12 +1638,20 @@ static int mlx5e_set_dev_port_mtu(struct net_device *netdev)
 	return 0;
 }
 
+static void mlx5e_redirect_rqts(struct mlx5e_priv *priv)
+{
+	mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT);
+	mlx5e_redirect_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+}
+
 int mlx5e_open_locked(struct net_device *netdev)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	int num_txqs;
 	int err;
 
+	set_bit(MLX5E_STATE_OPENED, &priv->state);
+
 	num_txqs = priv->params.num_channels * priv->params.num_tc;
 	netif_set_real_num_tx_queues(netdev, num_txqs);
 	netif_set_real_num_rx_queues(netdev, priv->params.num_channels);
@@ -1582,83 +1660,32 @@ int mlx5e_open_locked(struct net_device *netdev)
 	if (err)
 		return err;
 
-	err = mlx5e_open_tises(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_tises failed, %d\n",
-			   __func__, err);
-		return err;
-	}
-
 	err = mlx5e_open_channels(priv);
 	if (err) {
 		netdev_err(netdev, "%s: mlx5e_open_channels failed, %d\n",
 			   __func__, err);
-		goto err_close_tises;
-	}
-
-	err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_rqt(INDIR) failed, %d\n",
-			   __func__, err);
-		goto err_close_channels;
-	}
-
-	err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_rqt(SINGLE) failed, %d\n",
-			   __func__, err);
-		goto err_close_rqt_indir;
-	}
-
-	err = mlx5e_open_tirs(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_tir failed, %d\n",
-			   __func__, err);
-		goto err_close_rqt_single;
-	}
-
-	err = mlx5e_open_flow_table(priv);
-	if (err) {
-		netdev_err(netdev, "%s: mlx5e_open_flow_table failed, %d\n",
-			   __func__, err);
-		goto err_close_tirs;
+		return err;
 	}
 
 	err = mlx5e_add_all_vlan_rules(priv);
 	if (err) {
 		netdev_err(netdev, "%s: mlx5e_add_all_vlan_rules failed, %d\n",
 			   __func__, err);
-		goto err_close_flow_table;
+		goto err_close_channels;
 	}
 
 	mlx5e_init_eth_addr(priv);
 
-	set_bit(MLX5E_STATE_OPENED, &priv->state);
-
 	mlx5e_update_carrier(priv);
+	mlx5e_redirect_rqts(priv);
 	mlx5e_set_rx_mode_core(priv);
 
 	schedule_delayed_work(&priv->update_stats_work, 0);
 	return 0;
 
-err_close_flow_table:
-	mlx5e_close_flow_table(priv);
-
-err_close_tirs:
-	mlx5e_close_tirs(priv);
-
-err_close_rqt_single:
-	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-
-err_close_rqt_indir:
-	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
-
 err_close_channels:
 	mlx5e_close_channels(priv);
 
-err_close_tises:
-	mlx5e_close_tises(priv);
-
 	return err;
 }
 
@@ -1682,13 +1709,9 @@ int mlx5e_close_locked(struct net_device *netdev)
 
 	mlx5e_set_rx_mode_core(priv);
 	mlx5e_del_all_vlan_rules(priv);
+	mlx5e_redirect_rqts(priv);
 	netif_carrier_off(priv->netdev);
-	mlx5e_close_flow_table(priv);
-	mlx5e_close_tirs(priv);
-	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
-	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
 	mlx5e_close_channels(priv);
-	mlx5e_close_tises(priv);
 
 	return 0;
 }
@@ -1766,6 +1789,8 @@ static int mlx5e_set_features(struct net_device *netdev,
 			mlx5e_close_locked(priv->netdev);
 
 		priv->params.lro_en = !!(features & NETIF_F_LRO);
+		mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP);
+		mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP);
 
 		if (was_opened)
 			err = mlx5e_open_locked(priv->netdev);
@@ -2026,16 +2051,72 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
 		goto err_dealloc_transport_domain;
 	}
 
+	err = mlx5e_open_tises(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open tises failed, %d\n", err);
+		goto err_destroy_mkey;
+	}
+
+	err = mlx5e_open_drop_rq(priv);
+	if (err) {
+		mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
+		goto err_close_tises;
+	}
+
+	err = mlx5e_open_rqt(priv, MLX5E_INDIRECTION_RQT);
+	if (err) {
+		mlx5_core_warn(mdev, "open rqt(INDIR) failed, %d\n", err);
+		goto err_close_drop_rq;
+	}
+
+	err = mlx5e_open_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+	if (err) {
+		mlx5_core_warn(mdev, "open rqt(SINGLE) failed, %d\n", err);
+		goto err_close_rqt_indir;
+	}
+
+	err = mlx5e_open_tirs(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open tirs failed, %d\n", err);
+		goto err_close_rqt_single;
+	}
+
+	err = mlx5e_open_flow_table(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "open flow table failed, %d\n", err);
+		goto err_close_tirs;
+	}
+
+	mlx5e_init_eth_addr(priv);
+
 	err = register_netdev(netdev);
 	if (err) {
 		mlx5_core_err(mdev, "register_netdev failed, %d\n", err);
-		goto err_destroy_mkey;
+		goto err_close_flow_table;
 	}
 
 	mlx5e_enable_async_events(priv);
 
 	return priv;
 
+err_close_flow_table:
+	mlx5e_close_flow_table(priv);
+
+err_close_tirs:
+	mlx5e_close_tirs(priv);
+
+err_close_rqt_single:
+	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+
+err_close_rqt_indir:
+	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
+
+err_close_drop_rq:
+	mlx5e_close_drop_rq(priv);
+
+err_close_tises:
+	mlx5e_close_tises(priv);
+
 err_destroy_mkey:
 	mlx5_core_destroy_mkey(mdev, &priv->mr);
 
@@ -2060,6 +2141,12 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
 	struct net_device *netdev = priv->netdev;
 
 	unregister_netdev(netdev);
+	mlx5e_close_flow_table(priv);
+	mlx5e_close_tirs(priv);
+	mlx5e_close_rqt(priv, MLX5E_SINGLE_RQ_RQT);
+	mlx5e_close_rqt(priv, MLX5E_INDIRECTION_RQT);
+	mlx5e_close_drop_rq(priv);
+	mlx5e_close_tises(priv);
 	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
 	mlx5_dealloc_transport_domain(priv->mdev, priv->tdn);
 	mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index e6453f61141e..b4c87c7b0cf0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -387,6 +387,18 @@ int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
 	return err;
 }
 
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+			 int inlen)
+{
+	u32 out[MLX5_ST_SZ_DW(modify_rqt_out)];
+
+	MLX5_SET(modify_rqt_in, in, rqtn, rqtn);
+	MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT);
+
+	memset(out, 0, sizeof(out));
+	return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out));
+}
+
 void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn)
 {
 	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
index d436c2d8b527..74cae51436e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.h
@@ -65,6 +65,8 @@ int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
 
 int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *rqtn);
+int mlx5_core_modify_rqt(struct mlx5_core_dev *dev, u32 rqtn, u32 *in,
+			 int inlen);
 void mlx5_core_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn);
 
 #endif /* __TRANSOBJ_H__ */
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 469b7bda3304..dd2097455a2e 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4123,6 +4123,13 @@ struct mlx5_ifc_modify_rqt_out_bits {
 	u8         reserved_1[0x40];
 };
 
+struct mlx5_ifc_rqt_bitmask_bits {
+	u8	   reserved[0x20];
+
+	u8         reserved1[0x1f];
+	u8         rqn_list[0x1];
+};
+
 struct mlx5_ifc_modify_rqt_in_bits {
 	u8         opcode[0x10];
 	u8         reserved_0[0x10];
@@ -4135,7 +4142,7 @@ struct mlx5_ifc_modify_rqt_in_bits {
 
 	u8         reserved_3[0x20];
 
-	u8         modify_bitmask[0x40];
+	struct mlx5_ifc_rqt_bitmask_bits bitmask;
 
 	u8         reserved_4[0x40];
 
-- 
cgit v1.2.3


From efea389d3cc6427a9a94e92b2d7bf4c862f2cfcf Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Tue, 4 Aug 2015 14:05:47 +0300
Subject: net/mlx5_core: Support physical port counters

Added physical port counters in the following standard formats to
ethtool statistics:
  - IEEE 802.3
  - RFC2863
  - RFC2819

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 75 ++++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 10 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 42 ++++++++++++
 include/linux/mlx5/device.h                        | 10 +++
 include/linux/mlx5/driver.h                        |  1 +
 5 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 35c33907a9ff..e9d7d90363a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -138,6 +138,80 @@ struct mlx5e_vport_stats {
 #define NUM_VPORT_COUNTERS     31
 };
 
+static const char pport_strings[][ETH_GSTRING_LEN] = {
+	/* IEEE802.3 counters */
+	"frames_tx",
+	"frames_rx",
+	"check_seq_err",
+	"alignment_err",
+	"octets_tx",
+	"octets_received",
+	"multicast_xmitted",
+	"broadcast_xmitted",
+	"multicast_rx",
+	"broadcast_rx",
+	"in_range_len_errors",
+	"out_of_range_len",
+	"too_long_errors",
+	"symbol_err",
+	"mac_control_tx",
+	"mac_control_rx",
+	"unsupported_op_rx",
+	"pause_ctrl_rx",
+	"pause_ctrl_tx",
+
+	/* RFC2863 counters */
+	"in_octets",
+	"in_ucast_pkts",
+	"in_discards",
+	"in_errors",
+	"in_unknown_protos",
+	"out_octets",
+	"out_ucast_pkts",
+	"out_discards",
+	"out_errors",
+	"in_multicast_pkts",
+	"in_broadcast_pkts",
+	"out_multicast_pkts",
+	"out_broadcast_pkts",
+
+	/* RFC2819 counters */
+	"drop_events",
+	"octets",
+	"pkts",
+	"broadcast_pkts",
+	"multicast_pkts",
+	"crc_align_errors",
+	"undersize_pkts",
+	"oversize_pkts",
+	"fragments",
+	"jabbers",
+	"collisions",
+	"p64octets",
+	"p65to127octets",
+	"p128to255octets",
+	"p256to511octets",
+	"p512to1023octets",
+	"p1024to1518octets",
+	"p1519to2047octets",
+	"p2048to4095octets",
+	"p4096to8191octets",
+	"p8192to10239octets",
+};
+
+#define NUM_IEEE_802_3_COUNTERS		19
+#define NUM_RFC_2863_COUNTERS		13
+#define NUM_RFC_2819_COUNTERS		21
+#define NUM_PPORT_COUNTERS		(NUM_IEEE_802_3_COUNTERS + \
+					 NUM_RFC_2863_COUNTERS + \
+					 NUM_RFC_2819_COUNTERS)
+
+struct mlx5e_pport_stats {
+	__be64 IEEE_802_3_counters[NUM_IEEE_802_3_COUNTERS];
+	__be64 RFC_2863_counters[NUM_RFC_2863_COUNTERS];
+	__be64 RFC_2819_counters[NUM_RFC_2819_COUNTERS];
+};
+
 static const char rq_stats_strings[][ETH_GSTRING_LEN] = {
 	"packets",
 	"csum_none",
@@ -180,6 +254,7 @@ struct mlx5e_sq_stats {
 
 struct mlx5e_stats {
 	struct mlx5e_vport_stats   vport;
+	struct mlx5e_pport_stats   pport;
 };
 
 struct mlx5e_params {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index b95aa3384c36..b549797b315f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -171,7 +171,7 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
 
 	switch (sset) {
 	case ETH_SS_STATS:
-		return NUM_VPORT_COUNTERS +
+		return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
 		       priv->params.num_channels * NUM_RQ_STATS +
 		       priv->params.num_channels * priv->params.num_tc *
 						   NUM_SQ_STATS;
@@ -200,6 +200,11 @@ static void mlx5e_get_strings(struct net_device *dev,
 			strcpy(data + (idx++) * ETH_GSTRING_LEN,
 			       vport_strings[i]);
 
+		/* PPORT counters */
+		for (i = 0; i < NUM_PPORT_COUNTERS; i++)
+			strcpy(data + (idx++) * ETH_GSTRING_LEN,
+			       pport_strings[i]);
+
 		/* per channel counters */
 		for (i = 0; i < priv->params.num_channels; i++)
 			for (j = 0; j < NUM_RQ_STATS; j++)
@@ -234,6 +239,9 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
 	for (i = 0; i < NUM_VPORT_COUNTERS; i++)
 		data[idx++] = ((u64 *)&priv->stats.vport)[i];
 
+	for (i = 0; i < NUM_PPORT_COUNTERS; i++)
+		data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
+
 	/* per channel counters */
 	for (i = 0; i < priv->params.num_channels; i++)
 		for (j = 0; j < NUM_RQ_STATS; j++)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b8023a7484e0..111427b33ec8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -82,6 +82,47 @@ static void mlx5e_update_carrier_work(struct work_struct *work)
 	mutex_unlock(&priv->state_lock);
 }
 
+static void mlx5e_update_pport_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5e_pport_stats *s = &priv->stats.pport;
+	u32 *in;
+	u32 *out;
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+
+	in  = mlx5_vzalloc(sz);
+	out = mlx5_vzalloc(sz);
+	if (!in || !out)
+		goto free_out;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->IEEE_802_3_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->IEEE_802_3_counters));
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->RFC_2863_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->RFC_2863_counters));
+
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP);
+	mlx5_core_access_reg(mdev, in, sz, out,
+			     sz, MLX5_REG_PPCNT, 0, 0);
+	memcpy(s->RFC_2819_counters,
+	       MLX5_ADDR_OF(ppcnt_reg, out, counter_set),
+	       sizeof(s->RFC_2819_counters));
+
+free_out:
+	kvfree(in);
+	kvfree(out);
+}
+
 void mlx5e_update_stats(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
@@ -202,6 +243,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
 	s->tx_csum_offload = s->tx_packets - tx_offload_none;
 	s->rx_csum_good    = s->rx_packets - s->rx_csum_none;
 
+	mlx5e_update_pport_counters(priv);
 free_out:
 	kvfree(out);
 }
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index b943cd9e2097..250b1ff8b48d 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1182,6 +1182,16 @@ enum {
 	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
 };
 
+enum {
+	MLX5_IEEE_802_3_COUNTERS_GROUP	      = 0x0,
+	MLX5_RFC_2863_COUNTERS_GROUP	      = 0x1,
+	MLX5_RFC_2819_COUNTERS_GROUP	      = 0x2,
+	MLX5_RFC_3635_COUNTERS_GROUP	      = 0x3,
+	MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP = 0x5,
+	MLX5_PER_PRIORITY_COUNTERS_GROUP      = 0x10,
+	MLX5_PER_TRAFFIC_CLASS_COUNTERS_GROUP = 0x11
+};
+
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5fe0cae1a515..2039546b0ec6 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -103,6 +103,7 @@ enum {
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
 	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PPCNT		 = 0x5008,
 	MLX5_REG_PMAOS		 = 0x5012,
 	MLX5_REG_PUDE		 = 0x5009,
 	MLX5_REG_PMPE		 = 0x5010,
-- 
cgit v1.2.3


From d92cff89a0c80e7e49796366e441d97f07b5d321 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 4 Aug 2015 18:26:19 +0200
Subject: net_dbg_ratelimited: turn into no-op when !DEBUG

The pr_debug family of functions turns into a no-op when -DDEBUG is not
specified, opting instead to call "no_printk", which gets compiled to a
no-op (but retains gcc's nice warnings about printf-style arguments).

The problem with net_dbg_ratelimited is that it is defined to be a
variant of net_ratelimited_function, which expands to essentially:

    if (net_ratelimit())
        pr_debug(fmt, ...);

When DEBUG is not defined, then this becomes,

    if (net_ratelimit())
        ;

This seems benign, except it isn't. Firstly, there's the obvious
overhead of calling net_ratelimit needlessly, which does quite some book
keeping for the rate limiting. Given that the pr_debug and
net_dbg_ratelimited family of functions are sprinkled liberally through
performance critical code, with developers assuming they'll be compiled
out to a no-op most of the time, we certainly do not want this needless
book keeping. Secondly, and most visibly, even though no debug message
is printed when DEBUG is not defined, if there is a flood of
invocations, dmesg winds up peppered with messages such as
"net_ratelimit: 320 callbacks suppressed". This is because our
aforementioned net_ratelimit() function actually prints this text in
some circumstances. It's especially odd to see this when there isn't any
other accompanying debug message.

So, in sum, it doesn't make sense to have this function's current
behavior, and instead it should match what every other debug family of
functions in the kernel does with !DEBUG -- nothing.

This patch replaces calls to net_dbg_ratelimited when !DEBUG with
no_printk, keeping with the idiom of all the other debug print helpers.

Also, though not strictly neccessary, it guards the call with an if (0)
so that all evaluation of any arguments are sure to be compiled out.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/net.h b/include/linux/net.h
index 04aa06852771..049d4b03c4c4 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -239,8 +239,16 @@ do {								\
 	net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__)
 #define net_info_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
+#if defined(DEBUG)
 #define net_dbg_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
+#else
+#define net_dbg_ratelimited(fmt, ...)				\
+	do {							\
+		if (0)						\
+			no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+	} while (0)
+#endif
 
 bool __net_get_random_once(void *buf, int nbytes, bool *done,
 			   struct static_key *done_key);
-- 
cgit v1.2.3


From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001
From: Andreas Schultz <aschultz@tpip.net>
Date: Wed, 5 Aug 2015 17:51:45 +0200
Subject: netfilter: nfacct: per network namespace support

- Move the nfnl_acct_list into the network namespace, initialize
  and destroy it per namespace
- Keep track of refcnt on nfacct objects, the old logic does not
  longer work with a per namespace list
- Adjust xt_nfacct to pass the namespace when registring objects

Signed-off-by: Andreas Schultz <aschultz@tpip.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_acct.h |  3 +-
 include/net/net_namespace.h              |  3 ++
 net/netfilter/nfnetlink_acct.c           | 71 ++++++++++++++++++++++----------
 net/netfilter/xt_nfacct.c                |  2 +-
 4 files changed, 56 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h
index 6ec975748742..80ca889b164e 100644
--- a/include/linux/netfilter/nfnetlink_acct.h
+++ b/include/linux/netfilter/nfnetlink_acct.h
@@ -2,6 +2,7 @@
 #define _NFNL_ACCT_H_
 
 #include <uapi/linux/netfilter/nfnetlink_acct.h>
+#include <net/net_namespace.h>
 
 enum {
 	NFACCT_NO_QUOTA		= -1,
@@ -11,7 +12,7 @@ enum {
 
 struct nf_acct;
 
-struct nf_acct *nfnl_acct_find_get(const char *filter_name);
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name);
 void nfnl_acct_put(struct nf_acct *acct);
 void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct);
 extern int nfnl_acct_overquota(const struct sk_buff *skb,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e951453e0a23..2dcea635ecce 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -118,6 +118,9 @@ struct net {
 #endif
 	struct sock		*nfnl;
 	struct sock		*nfnl_stash;
+#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT)
+	struct list_head        nfnl_acct_list;
+#endif
 #endif
 #ifdef CONFIG_WEXT_CORE
 	struct sk_buff_head	wext_nlevents;
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index c18af2f63eef..fefbf5f0b28d 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -27,8 +27,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
 MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
 
-static LIST_HEAD(nfnl_acct_list);
-
 struct nf_acct {
 	atomic64_t		pkts;
 	atomic64_t		bytes;
@@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
 	struct nf_acct *nfacct, *matching = NULL;
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	unsigned int size = 0;
 	u32 flags = 0;
@@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 	if (strlen(acct_name) == 0)
 		return -EINVAL;
 
-	list_for_each_entry(nfacct, &nfnl_acct_list, head) {
+	list_for_each_entry(nfacct, &net->nfnl_acct_list, head) {
 		if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
 			     be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS])));
 	}
 	atomic_set(&nfacct->refcnt, 1);
-	list_add_tail_rcu(&nfacct->head, &nfnl_acct_list);
+	list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list);
 	return 0;
 }
 
@@ -185,6 +184,7 @@ nla_put_failure:
 static int
 nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = sock_net(skb->sk);
 	struct nf_acct *cur, *last;
 	const struct nfacct_filter *filter = cb->data;
 
@@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		cb->args[1] = 0;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (last) {
 			if (cur != last)
 				continue;
@@ -257,6 +257,7 @@ static int
 nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	int ret = -ENOENT;
 	struct nf_acct *cur;
 	char *acct_name;
@@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
 		return -EINVAL;
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		struct sk_buff *skb2;
 
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
@@ -336,19 +337,20 @@ static int
 nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
 {
+	struct net *net = sock_net(nfnl);
 	char *acct_name;
 	struct nf_acct *cur;
 	int ret = -ENOENT;
 
 	if (!tb[NFACCT_NAME]) {
-		list_for_each_entry(cur, &nfnl_acct_list, head)
+		list_for_each_entry(cur, &net->nfnl_acct_list, head)
 			nfnl_acct_try_del(cur);
 
 		return 0;
 	}
 	acct_name = nla_data(tb[NFACCT_NAME]);
 
-	list_for_each_entry(cur, &nfnl_acct_list, head) {
+	list_for_each_entry(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
 			continue;
 
@@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = {
 
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
 
-struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name)
 {
 	struct nf_acct *cur, *acct = NULL;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(cur, &nfnl_acct_list, head) {
+	list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) {
 		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
 			continue;
 
@@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
 
 void nfnl_acct_put(struct nf_acct *acct)
 {
-	atomic_dec(&acct->refcnt);
+	if (atomic_dec_and_test(&acct->refcnt))
+		kfree_rcu(acct, rcu_head);
+
 	module_put(THIS_MODULE);
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_put);
@@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct)
 }
 EXPORT_SYMBOL_GPL(nfnl_acct_overquota);
 
+static int __net_init nfnl_acct_net_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nfnl_acct_list);
+
+	return 0;
+}
+
+static void __net_exit nfnl_acct_net_exit(struct net *net)
+{
+	struct nf_acct *cur, *tmp;
+
+	list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) {
+		list_del_rcu(&cur->head);
+
+		if (atomic_dec_and_test(&cur->refcnt))
+			kfree_rcu(cur, rcu_head);
+	}
+}
+
+static struct pernet_operations nfnl_acct_ops = {
+        .init   = nfnl_acct_net_init,
+        .exit   = nfnl_acct_net_exit,
+};
+
 static int __init nfnl_acct_init(void)
 {
 	int ret;
 
+	ret = register_pernet_subsys(&nfnl_acct_ops);
+	if (ret < 0) {
+		pr_err("nfnl_acct_init: failed to register pernet ops\n");
+		goto err_out;
+	}
+
 	pr_info("nfnl_acct: registering with nfnetlink.\n");
 	ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
 	if (ret < 0) {
 		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
-		goto err_out;
+		goto cleanup_pernet;
 	}
 	return 0;
+
+cleanup_pernet:
+	unregister_pernet_subsys(&nfnl_acct_ops);
 err_out:
 	return ret;
 }
 
 static void __exit nfnl_acct_exit(void)
 {
-	struct nf_acct *cur, *tmp;
-
 	pr_info("nfnl_acct: unregistering from nfnetlink.\n");
 	nfnetlink_subsys_unregister(&nfnl_acct_subsys);
-
-	list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) {
-		list_del_rcu(&cur->head);
-		/* We are sure that our objects have no clients at this point,
-		 * it's safe to release them all without checking refcnt. */
-		kfree_rcu(cur, rcu_head);
-	}
+	unregister_pernet_subsys(&nfnl_acct_ops);
 }
 
 module_init(nfnl_acct_init);
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index 8c646ed9c921..3048a7e3a90a 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par)
 	struct xt_nfacct_match_info *info = par->matchinfo;
 	struct nf_acct *nfacct;
 
-	nfacct = nfnl_acct_find_get(info->name);
+	nfacct = nfnl_acct_find_get(par->net, info->name);
 	if (nfacct == NULL) {
 		pr_info("xt_nfacct: accounting object with name `%s' "
 			"does not exists\n", info->name);
-- 
cgit v1.2.3


From ffe8690c85b8426db7783064724d106702f1b1e8 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:32 +0000
Subject: perf: add the necessary core perf APIs when accessing events counters
 in eBPF programs

This patch add three core perf APIs:
 - perf_event_attrs(): export the struct perf_event_attr from struct
   perf_event;
 - perf_event_get(): get the struct perf_event from the given fd;
 - perf_event_read_local(): read the events counters active on the
   current CPU;
These APIs are needed when accessing events counters in eBPF programs.

The API perf_event_read_local() comes from Peter and I add the
corresponding SOB.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h | 10 ++++++
 kernel/events/core.c       | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809433b3..092a0e8a479a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
 extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
+extern struct perf_event *perf_event_get(unsigned int fd);
+extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
+extern u64 perf_event_read_local(struct perf_event *event);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
+static inline struct perf_event *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	return ERR_PTR(-EINVAL);
+}
+static inline u64 perf_event_read_local(struct perf_event *event)	{ return -EINVAL; }
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
+static inline int perf_event_release_kernel(struct perf_event *event)	{ return 0; }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..e2c6a8886d4d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event)
 	return __perf_event_count(event);
 }
 
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+	unsigned long flags;
+	u64 val;
+
+	/*
+	 * Disabling interrupts avoids all counter scheduling (context
+	 * switches, timer based rotation and IPIs).
+	 */
+	local_irq_save(flags);
+
+	/* If this is a per-task event, it must be for current */
+	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+		     event->hw.target != current);
+
+	/* If this is a per-CPU event, it must be for this CPU */
+	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+		     event->cpu != smp_processor_id());
+
+	/*
+	 * It must not be an event with inherit set, we cannot read
+	 * all child counters from atomic context.
+	 */
+	WARN_ON_ONCE(event->attr.inherit);
+
+	/*
+	 * It must not have a pmu::count method, those are not
+	 * NMI safe.
+	 */
+	WARN_ON_ONCE(event->pmu->count);
+
+	/*
+	 * If the event is currently on this CPU, its either a per-task event,
+	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+	 * oncpu == -1).
+	 */
+	if (event->oncpu == smp_processor_id())
+		event->pmu->read(event);
+
+	val = local64_read(&event->count);
+	local_irq_restore(flags);
+
+	return val;
+}
+
 static u64 perf_event_read(struct perf_event *event)
 {
 	/*
@@ -8574,6 +8627,31 @@ void perf_event_delayed_put(struct task_struct *task)
 		WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
+struct perf_event *perf_event_get(unsigned int fd)
+{
+	int err;
+	struct fd f;
+	struct perf_event *event;
+
+	err = perf_fget_light(fd, &f);
+	if (err)
+		return ERR_PTR(err);
+
+	event = f.file->private_data;
+	atomic_long_inc(&event->refcount);
+	fdput(f);
+
+	return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+	if (!event)
+		return ERR_PTR(-EINVAL);
+
+	return &event->attr;
+}
+
 /*
  * inherit a event from parent task to child task:
  */
-- 
cgit v1.2.3


From 2a36f0b92eb638dd023870574eb471b1c56be9ad Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Thu, 6 Aug 2015 07:02:33 +0000
Subject: bpf: Make the bpf_prog_array_map more generic

All the map backends are of generic nature. In order to avoid
adding much special code into the eBPF core, rewrite part of
the bpf_prog_array map code and make it more generic. So the
new perf_event_array map type can reuse most of code with
bpf_prog_array map and add fewer lines of special code.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c |  6 ++--
 include/linux/bpf.h         |  8 +++--
 kernel/bpf/arraymap.c       | 80 +++++++++++++++++++++++++++------------------
 kernel/bpf/core.c           |  2 +-
 kernel/bpf/syscall.c        |  2 +-
 5 files changed, 60 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ec5214f39aa8..70efcd0940f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
  *     goto out;
  *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
  *     goto out;
- *   prog = array->prog[index];
+ *   prog = array->ptrs[index];
  *   if (prog == NULL)
  *     goto out;
  *   goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
 	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
 
-	/* prog = array->prog[index]; */
+	/* prog = array->ptrs[index]; */
 	EMIT4_off32(0x48, 0x8D, 0x84, 0xD6,       /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
-		    offsetof(struct bpf_array, prog));
+		    offsetof(struct bpf_array, ptrs));
 	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
 
 	/* if (prog == NULL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 139d6d2e123f..d495211d63d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -24,6 +24,10 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+
+	/* funcs called by prog_array and perf_event_array map */
+	void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
+	void (*map_fd_put_ptr) (void *ptr);
 };
 
 struct bpf_map {
@@ -142,13 +146,13 @@ struct bpf_array {
 	bool owner_jited;
 	union {
 		char value[0] __aligned(8);
-		struct bpf_prog *prog[0] __aligned(8);
+		void *ptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-void bpf_prog_array_map_clear(struct bpf_map *map);
+void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..45df6572ecfd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);
 
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-	/* only bpf_prog file descriptors can be stored in prog_array map */
+	/* only file descriptors can be stored in this type of map */
 	if (attr->value_size != sizeof(u32))
 		return ERR_PTR(-EINVAL);
 	return array_map_alloc(attr);
 }
 
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
 
 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
-		BUG_ON(array->prog[i] != NULL);
+		BUG_ON(array->ptrs[i] != NULL);
 	kvfree(array);
 }
 
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	return NULL;
 }
 
 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+				    void *value, u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *prog, *old_prog;
+	void *new_ptr, *old_ptr;
 	u32 index = *(u32 *)key, ufd;
 
 	if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
 		return -E2BIG;
 
 	ufd = *(u32 *)value;
-	prog = bpf_prog_get(ufd);
-	if (IS_ERR(prog))
-		return PTR_ERR(prog);
-
-	if (!bpf_prog_array_compatible(array, prog)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
+	new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+	if (IS_ERR(new_ptr))
+		return PTR_ERR(new_ptr);
 
-	old_prog = xchg(array->prog + index, prog);
-	if (old_prog)
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, new_ptr);
+	if (old_ptr)
+		map->ops->map_fd_put_ptr(old_ptr);
 
 	return 0;
 }
 
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct bpf_prog *old_prog;
+	void *old_ptr;
 	u32 index = *(u32 *)key;
 
 	if (index >= array->map.max_entries)
 		return -E2BIG;
 
-	old_prog = xchg(array->prog + index, NULL);
-	if (old_prog) {
-		bpf_prog_put_rcu(old_prog);
+	old_ptr = xchg(array->ptrs + index, NULL);
+	if (old_ptr) {
+		map->ops->map_fd_put_ptr(old_ptr);
 		return 0;
 	} else {
 		return -ENOENT;
 	}
 }
 
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct bpf_prog *prog = bpf_prog_get(fd);
+	if (IS_ERR(prog))
+		return prog;
+
+	if (!bpf_prog_array_compatible(array, prog)) {
+		bpf_prog_put(prog);
+		return ERR_PTR(-EINVAL);
+	}
+	return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+	struct bpf_prog *prog = ptr;
+
+	bpf_prog_put_rcu(prog);
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
 	for (i = 0; i < array->map.max_entries; i++)
-		prog_array_map_delete_elem(map, &i);
+		fd_array_map_delete_elem(map, &i);
 }
 
 static const struct bpf_map_ops prog_array_ops = {
-	.map_alloc = prog_array_map_alloc,
-	.map_free = prog_array_map_free,
+	.map_alloc = fd_array_map_alloc,
+	.map_free = fd_array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_lookup_elem = prog_array_map_lookup_elem,
-	.map_update_elem = prog_array_map_update_elem,
-	.map_delete_elem = prog_array_map_delete_elem,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = prog_fd_array_get_ptr,
+	.map_fd_put_ptr = prog_fd_array_put_ptr,
 };
 
 static struct bpf_map_type_list prog_array_type __read_mostly = {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fafa74161445..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -450,7 +450,7 @@ select_insn:
 
 		tail_call_cnt++;
 
-		prog = READ_ONCE(array->prog[index]);
+		prog = READ_ONCE(array->ptrs[index]);
 		if (unlikely(!prog))
 			goto out;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 		/* prog_array stores refcnt-ed bpf_prog pointers
 		 * release them all when user space closes prog_array_fd
 		 */
-		bpf_prog_array_map_clear(map);
+		bpf_fd_array_map_clear(map);
 
 	bpf_map_put(map);
 	return 0;
-- 
cgit v1.2.3


From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:34 +0000
Subject: bpf: Add new bpf map type to store the pointer to struct perf_event

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'.
This map only stores the pointer to struct perf_event. The
user space event FDs from perf_event_open() syscall are converted
to the pointer to struct perf_event and stored in map.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c    | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d495211d63d1..4fc1f4070789 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -10,6 +10,7 @@
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
 #include <linux/file.h>
+#include <linux/perf_event.h>
 
 struct bpf_map;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ce13c109b00..a1814e8e53a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH,
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df6572ecfd..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,60 @@ static int __init register_prog_array_map(void)
 	return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+	bpf_fd_array_map_clear(map);
+	fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+	struct perf_event *event;
+	const struct perf_event_attr *attr;
+
+	event = perf_event_get(fd);
+	if (IS_ERR(event))
+		return event;
+
+	attr = perf_event_attrs(event);
+	if (IS_ERR(attr))
+		return (void *)attr;
+
+	if (attr->type != PERF_TYPE_RAW &&
+	    attr->type != PERF_TYPE_HARDWARE) {
+		perf_event_release_kernel(event);
+		return ERR_PTR(-EINVAL);
+	}
+	return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+	struct perf_event *event = ptr;
+
+	perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+	.map_alloc = fd_array_map_alloc,
+	.map_free = perf_event_array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = fd_array_map_lookup_elem,
+	.map_update_elem = fd_array_map_update_elem,
+	.map_delete_elem = fd_array_map_delete_elem,
+	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
+	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+	.ops = &perf_event_array_ops,
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+	bpf_register_map_type(&perf_event_array_type);
+	return 0;
+}
+late_initcall(register_perf_event_array_map);
-- 
cgit v1.2.3


From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001
From: Kaixu Xia <xiakaixu@huawei.com>
Date: Thu, 6 Aug 2015 07:02:35 +0000
Subject: bpf: Implement function bpf_perf_event_read() that get the selected
 hardware PMU conuter

According to the perf_event_map_fd and index, the function
bpf_perf_event_read() can convert the corresponding map
value to the pointer to struct perf_event and return the
Hardware PMU counter value.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/verifier.c    | 48 +++++++++++++++++++++++++++++++++---------------
 kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4fc1f4070789..f57d7fed9ec3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
+extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a1814e8e53a7..92a48e2d5461 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -271,6 +271,7 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_key,
 	BPF_FUNC_skb_set_tunnel_key,
+	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd307df98cb3..48e1c7192560 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 	return err;
 }
 
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+	bool bool_map, bool_func;
+	int i;
+
+	if (!map)
+		return 0;
+
+	for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+		bool_map = (map->map_type == func_limit[i].map_type);
+		bool_func = (func_id == func_limit[i].func_id);
+		/* only when map & func pair match it can continue.
+		 * don't allow any other map type to be passed into
+		 * the special func;
+		 */
+		if (bool_map != bool_func)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
-	if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-	    func_id != BPF_FUNC_tail_call)
-		/* prog_array map type needs extra care:
-		 * only allow to pass it into bpf_tail_call() for now.
-		 * bpf_map_delete_elem() can be allowed in the future,
-		 * while bpf_map_update_elem() must only be done via syscall
-		 */
-		return -EINVAL;
-
-	if (func_id == BPF_FUNC_tail_call &&
-	    map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-		/* don't allow any other map type to be passed into
-		 * bpf_tail_call()
-		 */
-		return -EINVAL;
+	err = check_map_func_compatibility(map, func_id);
+	if (err)
+		return err;
 
 	return 0;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..ef9936df1b04 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_event *event;
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (!event)
+		return -ENOENT;
+
+	/*
+	 * we don't know if the function is run successfully by the
+	 * return value. It can be judged in other places, such as
+	 * eBPF programs.
+	 */
+	return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+	.func		= bpf_perf_event_read,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return bpf_get_trace_printk_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_perf_event_read:
+		return &bpf_perf_event_read_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 4c96b7dc0d393f12c17e0d81db15aa4a820a6ab3 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 12 Aug 2015 17:06:26 -0500
Subject: Add a matching set of device_ functions for determining mac/phy

OF has some helper functions for parsing MAC and PHY settings.
In cases where the platform is providing this information rather
than the device itself, there needs to be similar functions for ACPI.

These functions are slightly modified versions of the ones in
of_net which can use information provided via DT or ACPI.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/base/property.c  | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/property.h |  4 +++
 2 files changed, 77 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index f3f6d167f3f1..2e8cd147f02d 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -16,6 +16,8 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/property.h>
+#include <linux/etherdevice.h>
+#include <linux/phy.h>
 
 /**
  * device_add_property_set - Add a collection of properties to a device object.
@@ -533,3 +535,74 @@ bool device_dma_is_coherent(struct device *dev)
 	return coherent;
 }
 EXPORT_SYMBOL_GPL(device_dma_is_coherent);
+
+/**
+ * device_get_phy_mode - Get phy mode for given device_node
+ * @dev:	Pointer to the given device
+ *
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type', and return its index in phy_modes table, or errno in
+ * error case.
+ */
+int device_get_phy_mode(struct device *dev)
+{
+	const char *pm;
+	int err, i;
+
+	err = device_property_read_string(dev, "phy-mode", &pm);
+	if (err < 0)
+		err = device_property_read_string(dev,
+						  "phy-connection-type", &pm);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
+		if (!strcasecmp(pm, phy_modes(i)))
+			return i;
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(device_get_phy_mode);
+
+static void *device_get_mac_addr(struct device *dev,
+				 const char *name, char *addr,
+				 int alen)
+{
+	int ret = device_property_read_u8_array(dev, name, addr, alen);
+
+	if (ret == 0 && is_valid_ether_addr(addr))
+		return addr;
+	return NULL;
+}
+
+/**
+ * Search the device tree for the best MAC address to use.  'mac-address' is
+ * checked first, because that is supposed to contain to "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address.  If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the device tree, but were not set by U-Boot.  For example, the
+ * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
+ * addresses.  Some older U-Boots only initialized 'local-mac-address'.  In
+ * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
+ * but is all zeros.
+*/
+void *device_get_mac_address(struct device *dev, char *addr, int alen)
+{
+	addr = device_get_mac_addr(dev, "mac-address", addr, alen);
+	if (addr)
+		return addr;
+
+	addr = device_get_mac_addr(dev, "local-mac-address", addr, alen);
+	if (addr)
+		return addr;
+
+	return device_get_mac_addr(dev, "address", addr, alen);
+}
+EXPORT_SYMBOL(device_get_mac_address);
diff --git a/include/linux/property.h b/include/linux/property.h
index 76ebde9c11d4..a59c6ee566c2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -166,4 +166,8 @@ void device_add_property_set(struct device *dev, struct property_set *pset);
 
 bool device_dma_is_coherent(struct device *dev);
 
+int device_get_phy_mode(struct device *dev);
+
+void *device_get_mac_address(struct device *dev, char *addr, int alen);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.2.3


From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <gospo@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 10:39:01 -0400
Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down

Like the ipv4 patch with a similar title, this adds a sysctl to allow
the user to change routing behavior based on whether or not the
interface associated with the nexthop was an up or down link.  The
default setting preserves the current behavior, but anyone that enables
it will notice that nexthops on down interfaces will no longer be
selected:

net.ipv6.conf.all.ignore_routes_with_linkdown = 0
net.ipv6.conf.default.ignore_routes_with_linkdown = 0
net.ipv6.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, not only will link status be reported to
userspace, but an indication that a nexthop is dead and will not be used
is also reported.

1000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
1000::/8 via 8000::2 dev p8p1  metric 1024  pref medium
7000::/8 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
8000::/8 dev p8p1  proto kernel  metric 256  pref medium
9000::/8 via 8000::2 dev p8p1  metric 2048  pref medium
9000::/8 via 7000::2 dev p7p1  metric 1024 dead linkdown  pref medium
fe80::/64 dev p7p1  proto kernel  metric 256 dead linkdown  pref medium
fe80::/64 dev p8p1  proto kernel  metric 256  pref medium

This also adds devconf support and notification when sysctl values
change.

v2: drop use of rt6i_nhflags since it is not needed right now

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |   1 +
 include/uapi/linux/ipv6.h |   1 +
 net/ipv6/addrconf.c       | 105 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c          |  11 ++++-
 4 files changed, 116 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index cb9dcad72372..f1f32af6d9b9 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -31,6 +31,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_defrtr;
 	__s32		accept_ra_min_hop_limit;
 	__s32		accept_ra_pinfo;
+	__s32		ignore_routes_with_linkdown;
 #ifdef CONFIG_IPV6_ROUTER_PREF
 	__s32		accept_ra_rtr_pref;
 	__s32		rtr_probe_interval;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 80f3b74446a1..38b4fef20219 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -173,6 +173,7 @@ enum {
 	DEVCONF_STABLE_SECRET,
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
+	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 53e3a9d756b0..5dfbac72f1ab 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 		.initialized = false,
 	},
 	.use_oif_addrs_only	= 0,
+	.ignore_routes_with_linkdown = 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 		.initialized = false,
 	},
 	.use_oif_addrs_only	= 0,
+	.ignore_routes_with_linkdown = 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type)
 	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
+	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+		size += nla_total_size(4);
+
 	return size;
 }
 
@@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
+	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+			devconf->ignore_routes_with_linkdown) < 0)
+		goto nla_put_failure;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
 	[NETCONFA_IFINDEX]	= { .len = sizeof(int) },
 	[NETCONFA_FORWARDING]	= { .len = sizeof(int) },
 	[NETCONFA_PROXY_NEIGH]	= { .len = sizeof(int) },
+	[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]	= { .len = sizeof(int) },
 };
 
 static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
@@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
 		rt6_purge_dflt_routers(net);
 	return 1;
 }
+
+static void addrconf_linkdown_change(struct net *net, __s32 newf)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+
+	for_each_netdev(net, dev) {
+		idev = __in6_dev_get(dev);
+		if (idev) {
+			int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
+
+			idev->cnf.ignore_routes_with_linkdown = newf;
+			if (changed)
+				inet6_netconf_notify_devconf(dev_net(dev),
+							     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+							     dev->ifindex,
+							     &idev->cnf);
+		}
+	}
+}
+
+static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
+{
+	struct net *net;
+	int old;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	net = (struct net *)table->extra2;
+	old = *p;
+	*p = newf;
+
+	if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
+		if ((!newf) ^ (!old))
+			inet6_netconf_notify_devconf(net,
+						     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						     NETCONFA_IFINDEX_DEFAULT,
+						     net->ipv6.devconf_dflt);
+		rtnl_unlock();
+		return 0;
+	}
+
+	if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
+		net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+		addrconf_linkdown_change(net, newf);
+		if ((!newf) ^ (!old))
+			inet6_netconf_notify_devconf(net,
+						     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
+						     NETCONFA_IFINDEX_ALL,
+						     net->ipv6.devconf_all);
+	}
+	rtnl_unlock();
+
+	return 1;
+}
+
 #endif
 
 /* Nobody refers to this ifaddr, destroy it */
@@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
 	array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
 	array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
+	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 }
@@ -5338,6 +5407,34 @@ out:
 	return err;
 }
 
+static
+int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
+						int write,
+						void __user *buffer,
+						size_t *lenp,
+						loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	loff_t pos = *ppos;
+	struct ctl_table lctl;
+	int ret;
+
+	/* ctl->data points to idev->cnf.ignore_routes_when_linkdown
+	 * we should not modify it until we get the rtnl lock.
+	 */
+	lctl = *ctl;
+	lctl.data = &val;
+
+	ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
+
+	if (write)
+		ret = addrconf_fixup_linkdown(ctl, valp, val);
+	if (ret)
+		*ppos = pos;
+	return ret;
+}
+
 static struct addrconf_sysctl_table
 {
 	struct ctl_table_header *sysctl_header;
@@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table
 			.maxlen         = sizeof(int),
 			.mode           = 0644,
 			.proc_handler   = proc_dointvec,
-
+		},
+		{
+			.procname	= "ignore_routes_with_linkdown",
+			.data		= &ipv6_devconf.ignore_routes_with_linkdown,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
 		{
 			/* sentinel */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 370f72785385..1c0217e61357 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 {
 	int m;
 	bool match_do_rr = false;
+	struct inet6_dev *idev = rt->rt6i_idev;
+	struct net_device *dev = rt->dst.dev;
+
+	if (dev && !netif_carrier_ok(dev) &&
+	    idev->cnf.ignore_routes_with_linkdown)
+		goto out;
 
 	if (rt6_check_expired(rt))
 		goto out;
@@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net,
 	else
 		rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
-	if (!netif_carrier_ok(rt->dst.dev))
+	if (!netif_carrier_ok(rt->dst.dev)) {
 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
+		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
+			rtm->rtm_flags |= RTNH_F_DEAD;
+	}
 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
 	rtm->rtm_protocol = rt->rt6i_protocol;
 	if (rt->rt6i_flags & RTF_DYNAMIC)
-- 
cgit v1.2.3


From 4e3c89920cd3a6cfce22c6f537690747c26128dd Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 13 Aug 2015 14:59:00 -0600
Subject: net: Introduce VRF related flags and helpers

Add a VRF_MASTER flag for interfaces and helper functions for determining
if a device is a VRF_MASTER.

Add link attribute for passing VRF_TABLE id.

Add vrf_ptr to netdevice.

Add various macros for determining if a device is a VRF device, the index
of the master VRF device and table associated with VRF device.

Signed-off-by: Shrijeet Mukherjee <shm@cumulusnetworks.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  20 +++++++
 include/net/vrf.h            | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_link.h |   9 +++
 3 files changed, 168 insertions(+)
 create mode 100644 include/net/vrf.h

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 607b5f41f46f..f7a6ef2fae3a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1289,6 +1289,7 @@ enum netdev_priv_flags {
 	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
 	IFF_IPVLAN_MASTER		= 1<<23,
 	IFF_IPVLAN_SLAVE		= 1<<24,
+	IFF_VRF_MASTER			= 1<<25,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1316,6 +1317,7 @@ enum netdev_priv_flags {
 #define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
 #define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
+#define IFF_VRF_MASTER			IFF_VRF_MASTER
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -1432,6 +1434,7 @@ enum netdev_priv_flags {
  *	@dn_ptr:	DECnet specific data
  *	@ip6_ptr:	IPv6 specific data
  *	@ax25_ptr:	AX.25 specific data
+ *	@vrf_ptr:	VRF specific data
  *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
  *
  *	@last_rx:	Time of last Rx
@@ -1650,6 +1653,7 @@ struct net_device {
 	struct dn_dev __rcu     *dn_ptr;
 	struct inet6_dev __rcu	*ip6_ptr;
 	void			*ax25_ptr;
+	struct net_vrf_dev __rcu *vrf_ptr;
 	struct wireless_dev	*ieee80211_ptr;
 	struct wpan_dev		*ieee802154_ptr;
 #if IS_ENABLED(CONFIG_MPLS_ROUTING)
@@ -3808,6 +3812,22 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
 	return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+static inline bool netif_is_vrf(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_VRF_MASTER;
+}
+
+static inline bool netif_index_is_vrf(struct net *net, int ifindex)
+{
+	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
+	bool rc = false;
+
+	if (dev)
+		rc = netif_is_vrf(dev);
+
+	return rc;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/include/net/vrf.h b/include/net/vrf.h
new file mode 100644
index 000000000000..0484d29d4589
--- /dev/null
+++ b/include/net/vrf.h
@@ -0,0 +1,139 @@
+/*
+ * include/net/net_vrf.h - adds vrf dev structure definitions
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_NET_VRF_H
+#define __LINUX_NET_VRF_H
+
+struct net_vrf_dev {
+	struct rcu_head		rcu;
+	int                     ifindex; /* ifindex of master dev */
+	u32                     tb_id;   /* table id for VRF */
+};
+
+struct slave {
+	struct list_head	list;
+	struct net_device	*dev;
+};
+
+struct slave_queue {
+	struct list_head	all_slaves;
+	int			num_slaves;
+};
+
+struct net_vrf {
+	struct slave_queue	queue;
+	struct rtable           *rth;
+	u32			tb_id;
+};
+
+
+#if IS_ENABLED(CONFIG_NET_VRF)
+/* called with rcu_read_lock() */
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	struct net_vrf_dev *vrf_ptr;
+	int ifindex = 0;
+
+	if (!dev)
+		return 0;
+
+	if (netif_is_vrf(dev))
+		ifindex = dev->ifindex;
+	else {
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			ifindex = vrf_ptr->ifindex;
+	}
+
+	return ifindex;
+}
+
+/* called with rcu_read_lock */
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rcu_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	int tb_id;
+
+	rcu_read_lock();
+	tb_id = vrf_dev_table_rcu(dev);
+	rcu_read_unlock();
+
+	return tb_id;
+}
+
+/* called with rtnl */
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	int tb_id = 0;
+
+	if (dev) {
+		struct net_vrf_dev *vrf_ptr;
+
+		vrf_ptr = rtnl_dereference(dev->vrf_ptr);
+		if (vrf_ptr)
+			tb_id = vrf_ptr->tb_id;
+	}
+	return tb_id;
+}
+
+/* caller has already checked netif_is_vrf(dev) */
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	struct rtable *rth = ERR_PTR(-ENETUNREACH);
+	struct net_vrf *vrf = netdev_priv(dev);
+
+	if (vrf) {
+		rth = vrf->rth;
+		atomic_inc(&rth->dst.__refcnt);
+	}
+	return rth;
+}
+
+#else
+static inline int vrf_master_ifindex_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rcu(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline int vrf_dev_table_rtnl(const struct net_device *dev)
+{
+	return 0;
+}
+
+static inline struct rtable *vrf_dev_get_rth(const struct net_device *dev)
+{
+	return ERR_PTR(-ENETUNREACH);
+}
+#endif
+
+#endif /* __LINUX_NET_VRF_H */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d450be36add2..313c305fd1ad 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -341,6 +341,15 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* VRF section */
+enum {
+	IFLA_VRF_UNSPEC,
+	IFLA_VRF_TABLE,
+	__IFLA_VRF_MAX
+};
+
+#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From 2377799c084d86d22074cd4acd20edc32024d669 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 13 Jul 2015 12:17:25 +0200
Subject: average: provide macro to create static EWMA

Having the EWMA parameters stored in the runtime struct imposes
memory requirements for the constant values that could just be
inlined in the code. This particularly makes sense if there are
a lot of such structs, for example in mac80211 in the station
table where each station has a number of these in an array, and
there can be many stations.

Provide a macro DECLARE_EWMA() that declares the necessary struct
and inline functions to access it with the parameters hard-coded;
using this also means the user no longer needs to 'select AVERAGE'
as it's entirely self-contained.

In the mac80211 case, on x86-64, this actually slightly *reduces*
code size, while also saving 80 bytes of runtime memory per sta.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/average.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/average.h b/include/linux/average.h
index c6028fd742c1..60f8226c5652 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -27,4 +27,43 @@ static inline unsigned long ewma_read(const struct ewma *avg)
 	return avg->internal >> avg->factor;
 }
 
+#define DECLARE_EWMA(name, _factor, _weight)				\
+	struct ewma_##name {						\
+		unsigned long internal;					\
+	};								\
+	static inline void ewma_##name##_init(struct ewma_##name *e)	\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		e->internal = 0;					\
+	}								\
+	static inline unsigned long					\
+	ewma_##name##_read(struct ewma_##name *e)			\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+		return e->internal >> ilog2(_factor);			\
+	}								\
+	static inline void ewma_##name##_add(struct ewma_##name *e,	\
+					     unsigned long val)		\
+	{								\
+		unsigned long internal = ACCESS_ONCE(e->internal);	\
+		unsigned long weight = ilog2(_weight);			\
+		unsigned long factor = ilog2(_factor);			\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(_factor));		\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight));		\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_factor);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight);			\
+									\
+		ACCESS_ONCE(e->internal) = internal ?			\
+			(((internal << weight) - internal) +		\
+				(val << factor)) >> weight :		\
+			(val << factor);				\
+	}
+
 #endif /* _LINUX_AVERAGE_H */
-- 
cgit v1.2.3


From 8f9c98df949333f08b74e5df1caacf7e2c5e8552 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Sun, 19 Jul 2015 16:09:12 +0300
Subject: mac80211: fix BIT position for TDLS WIDE extended cap

The bit was not according to ieee80211 specification.
Fix that.

Reviewed-by: Arik Nemtsov <arik@wizery.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index b9c7897dc566..cfa906f28b7a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2074,8 +2074,8 @@ enum ieee80211_tdls_actioncode {
 #define WLAN_EXT_CAPA5_TDLS_PROHIBITED	BIT(6)
 #define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED	BIT(7)
 
+#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(5)
 #define WLAN_EXT_CAPA8_OPMODE_NOTIF	BIT(6)
-#define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED	BIT(7)
 
 /* TDLS specific payload type in the LLC/SNAP header */
 #define WLAN_TDLS_SNAP_RFTYPE	0x2
-- 
cgit v1.2.3


From 76b733d15874128ee2d0365b4cbe7d51decd8d37 Mon Sep 17 00:00:00 2001
From: Christophe Ricard <christophe.ricard@gmail.com>
Date: Fri, 14 Aug 2015 22:33:30 +0200
Subject: nfc: st-nci: Remove duplicate file platform_data/st_nci.h

commit "nfc: st-nci: Rename st21nfcb to st-nci" adds
include/linux/platform_data/st_nci.h duplicated with
include/linux/platform_data/st-nci.h.

Only drivers/nfc/st-nci/i2c.c uses platform_data/st_nci.h.

Cc: stable@vger.kernel.org
Reported-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Christophe Ricard <christophe-h.ricard@st.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/nfc/st-nci/i2c.c             |  2 +-
 include/linux/platform_data/st_nci.h | 29 -----------------------------
 2 files changed, 1 insertion(+), 30 deletions(-)
 delete mode 100644 include/linux/platform_data/st_nci.h

(limited to 'include/linux')

diff --git a/drivers/nfc/st-nci/i2c.c b/drivers/nfc/st-nci/i2c.c
index 06175ce769bb..b2a467c2d668 100644
--- a/drivers/nfc/st-nci/i2c.c
+++ b/drivers/nfc/st-nci/i2c.c
@@ -25,7 +25,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/nfc.h>
-#include <linux/platform_data/st_nci.h>
+#include <linux/platform_data/st-nci.h>
 
 #include "ndlc.h"
 
diff --git a/include/linux/platform_data/st_nci.h b/include/linux/platform_data/st_nci.h
deleted file mode 100644
index d9d400a297bd..000000000000
--- a/include/linux/platform_data/st_nci.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Driver include for ST NCI NFC chip family.
- *
- * Copyright (C) 2014-2015  STMicroelectronics SAS. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _ST_NCI_H_
-#define _ST_NCI_H_
-
-#define ST_NCI_DRIVER_NAME "st_nci"
-
-struct st_nci_nfc_platform_data {
-	unsigned int gpio_reset;
-	unsigned int irq_polarity;
-};
-
-#endif /* _ST_NCI_H_ */
-- 
cgit v1.2.3


From fa8187c96471c49419c25d4ec3299d17d3f274b2 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 13 Aug 2015 19:01:06 +0200
Subject: net: declare new net_device priv_flag IFF_NO_QUEUE

This private net_device flag can be set by drivers to inform that a
device runs fine without a qdisc attached. This was formerly done by
setting tx_queue_len to zero.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f7a6ef2fae3a..d131755516ca 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1262,6 +1262,7 @@ struct net_device_ops {
  * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
  *	change when it's running
  * @IFF_MACVLAN: Macvlan device
+ * @IFF_NO_QUEUE: device can run without qdisc attached
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1290,6 +1291,7 @@ enum netdev_priv_flags {
 	IFF_IPVLAN_MASTER		= 1<<23,
 	IFF_IPVLAN_SLAVE		= 1<<24,
 	IFF_VRF_MASTER			= 1<<25,
+	IFF_NO_QUEUE			= 1<<26,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1318,6 +1320,7 @@ enum netdev_priv_flags {
 #define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 #define IFF_VRF_MASTER			IFF_VRF_MASTER
+#define IFF_NO_QUEUE			IFF_NO_QUEUE
 
 /**
  *	struct net_device - The DEVICE structure.
-- 
cgit v1.2.3


From fbaff3ef859a86dc3df2128d2de9f8a6e255a967 Mon Sep 17 00:00:00 2001
From: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Thu, 13 Aug 2015 18:34:03 -0700
Subject: net: fix endian check warning in etherdevice.h

Sparse builds have been warning for a really long time now
that etherdevice.h has a conversion that is unsafe.

  include/linux/etherdevice.h:79:32: warning: restricted __be16 degrades to integer

This code change fixes the issue and generates the exact
same assembly before/after (checked on x86_64)

Fixes: 2c722fe1c821 (etherdevice: Optimize a few is_<foo>_ether_addr functions)
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
CC: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 9012f8775208..eb049c622208 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -76,7 +76,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr)
 
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 	return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
-		((a[2] ^ b[2]) & m)) == 0;
+		(__force int)((a[2] ^ b[2]) & m)) == 0;
 #else
 	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
 #endif
-- 
cgit v1.2.3


From 6fa1bcab6be6e9bd93f80e345c7e9a4ec7861df9 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Sun, 16 Aug 2015 16:04:50 +0300
Subject: net/mlx5e: Ethtool link speed setting fixes

- Port speed settings are applied by the device only upon
  port admin status transition from DOWN to UP.
  So we enforce this transition regardless of the port's
  current operation state (which may be occasionally DOWN if
  for example the network cable is disconnected).
- Fix the PORT_UP/DOWN device interface enum
- Set the local_port bit in the device PAOS register
- EXPORT the PAOS (Port Administrative and Operational Status)
  register set/query access functions.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 25 ++++++----------------
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 14 ++++++++----
 include/linux/mlx5/driver.h                        | 11 +++++-----
 3 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 975828521913..77158b31eb0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -628,7 +628,7 @@ static int mlx5e_set_settings(struct net_device *netdev,
 	u32 link_modes;
 	u32 speed;
 	u32 eth_proto_cap, eth_proto_admin;
-	u8 port_status;
+	enum mlx5_port_status ps;
 	int err;
 
 	speed = ethtool_cmd_speed(cmd);
@@ -662,24 +662,13 @@ static int mlx5e_set_settings(struct net_device *netdev,
 	if (link_modes == eth_proto_admin)
 		goto out;
 
-	err = mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
-	if (err) {
-		netdev_err(netdev, "%s: set port eth proto admin failed: %d\n",
-			   __func__, err);
-		goto out;
-	}
+	mlx5_query_port_admin_status(mdev, &ps);
+	if (ps == MLX5_PORT_UP)
+		mlx5_set_port_admin_status(mdev, MLX5_PORT_DOWN);
+	mlx5_set_port_proto(mdev, link_modes, MLX5_PTYS_EN);
+	if (ps == MLX5_PORT_UP)
+		mlx5_set_port_admin_status(mdev, MLX5_PORT_UP);
 
-	err = mlx5_query_port_status(mdev, &port_status);
-	if (err)
-		goto out;
-
-	if (port_status == MLX5_PORT_DOWN)
-		return 0;
-
-	err = mlx5_set_port_status(mdev, MLX5_PORT_DOWN);
-	if (err)
-		goto out;
-	err = mlx5_set_port_status(mdev, MLX5_PORT_UP);
 out:
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 70147999f657..f8db52bd8f18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -216,22 +216,25 @@ int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_proto);
 
-int mlx5_set_port_status(struct mlx5_core_dev *dev,
-			 enum mlx5_port_status status)
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status)
 {
 	u32 in[MLX5_ST_SZ_DW(paos_reg)];
 	u32 out[MLX5_ST_SZ_DW(paos_reg)];
 
 	memset(in, 0, sizeof(in));
 
+	MLX5_SET(paos_reg, in, local_port, 1);
 	MLX5_SET(paos_reg, in, admin_status, status);
 	MLX5_SET(paos_reg, in, ase, 1);
 
 	return mlx5_core_access_reg(dev, in, sizeof(in), out,
 				    sizeof(out), MLX5_REG_PAOS, 0, 1);
 }
+EXPORT_SYMBOL_GPL(mlx5_set_port_admin_status);
 
-int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status)
 {
 	u32 in[MLX5_ST_SZ_DW(paos_reg)];
 	u32 out[MLX5_ST_SZ_DW(paos_reg)];
@@ -239,14 +242,17 @@ int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status)
 
 	memset(in, 0, sizeof(in));
 
+	MLX5_SET(paos_reg, in, local_port, 1);
+
 	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
 				   sizeof(out), MLX5_REG_PAOS, 0, 0);
 	if (err)
 		return err;
 
-	*status = MLX5_GET(paos_reg, out, oper_status);
+	*status = MLX5_GET(paos_reg, out, admin_status);
 	return err;
 }
+EXPORT_SYMBOL_GPL(mlx5_query_port_admin_status);
 
 static void mlx5_query_port_mtu(struct mlx5_core_dev *dev, int *admin_mtu,
 				int *max_mtu, int *oper_mtu, u8 port)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2039546b0ec6..4b5d7fc88d0f 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -152,8 +152,8 @@ enum mlx5_dev_event {
 };
 
 enum mlx5_port_status {
-	MLX5_PORT_UP        = 1 << 1,
-	MLX5_PORT_DOWN      = 1 << 2,
+	MLX5_PORT_UP        = 1,
+	MLX5_PORT_DOWN      = 2,
 };
 
 struct mlx5_uuar_info {
@@ -761,9 +761,10 @@ int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev,
 			       u8 local_port);
 int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin,
 			int proto_mask);
-int mlx5_set_port_status(struct mlx5_core_dev *dev,
-			 enum mlx5_port_status status);
-int mlx5_query_port_status(struct mlx5_core_dev *dev, u8 *status);
+int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
+			       enum mlx5_port_status status);
+int mlx5_query_port_admin_status(struct mlx5_core_dev *dev,
+				 enum mlx5_port_status *status);
 
 int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port);
 void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port);
-- 
cgit v1.2.3


From 3c2d18ef22df1bdccfb11a5b85b29e4e61b9d9c6 Mon Sep 17 00:00:00 2001
From: Achiad Shochat <achiad@mellanox.com>
Date: Sun, 16 Aug 2015 16:04:51 +0300
Subject: net/mlx5e: Support ethtool get/set_pauseparam

Only rx/tx pause settings.
Autoneg setting is currently not supported.

Signed-off-by: Achiad Shochat <achiad@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 38 ++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 42 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                        |  5 +++
 3 files changed, 85 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 77158b31eb0b..bce912688ca8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -820,6 +820,42 @@ static int mlx5e_set_tunable(struct net_device *dev,
 	return err;
 }
 
+static void mlx5e_get_pauseparam(struct net_device *netdev,
+				 struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	err = mlx5_query_port_pause(mdev, &pauseparam->rx_pause,
+				    &pauseparam->tx_pause);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_query_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+}
+
+static int mlx5e_set_pauseparam(struct net_device *netdev,
+				struct ethtool_pauseparam *pauseparam)
+{
+	struct mlx5e_priv *priv    = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	int err;
+
+	if (pauseparam->autoneg)
+		return -EINVAL;
+
+	err = mlx5_set_port_pause(mdev,
+				  pauseparam->rx_pause ? 1 : 0,
+				  pauseparam->tx_pause ? 1 : 0);
+	if (err) {
+		netdev_err(netdev, "%s: mlx5_set_port_pause failed:0x%x\n",
+			   __func__, err);
+	}
+
+	return err;
+}
+
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_drvinfo       = mlx5e_get_drvinfo,
 	.get_link          = ethtool_op_get_link,
@@ -841,4 +877,6 @@ const struct ethtool_ops mlx5e_ethtool_ops = {
 	.get_rxnfc         = mlx5e_get_rxnfc,
 	.get_tunable       = mlx5e_get_tunable,
 	.set_tunable       = mlx5e_set_tunable,
+	.get_pauseparam    = mlx5e_get_pauseparam,
+	.set_pauseparam    = mlx5e_set_pauseparam,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index f8db52bd8f18..821caaab9bfb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -334,3 +334,45 @@ int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_port_vl_hw_cap);
+
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+	MLX5_SET(pfcc_reg, in, pptx, tx_pause);
+	MLX5_SET(pfcc_reg, in, pprx, rx_pause);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PFCC, 0, 1);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_pause);
+
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause)
+{
+	u32 in[MLX5_ST_SZ_DW(pfcc_reg)];
+	u32 out[MLX5_ST_SZ_DW(pfcc_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	MLX5_SET(pfcc_reg, in, local_port, 1);
+
+	err = mlx5_core_access_reg(dev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_PFCC, 0, 0);
+	if (err)
+		return err;
+
+	if (rx_pause)
+		*rx_pause = MLX5_GET(pfcc_reg, out, pprx);
+
+	if (tx_pause)
+		*tx_pause = MLX5_GET(pfcc_reg, out, pptx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_pause);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 4b5d7fc88d0f..8b6d6f2154a4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -103,6 +103,7 @@ enum {
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
 	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PFCC            = 0x5007,
 	MLX5_REG_PPCNT		 = 0x5008,
 	MLX5_REG_PMAOS		 = 0x5012,
 	MLX5_REG_PUDE		 = 0x5009,
@@ -774,6 +775,10 @@ void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu,
 int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev,
 			      u8 *vl_hw_cap, u8 local_port);
 
+int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause);
+int mlx5_query_port_pause(struct mlx5_core_dev *dev,
+			  u32 *rx_pause, u32 *tx_pause);
+
 int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
 int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
-- 
cgit v1.2.3


From 2f52bdcf6ba1bd81597b1505a222430676548b3a Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Aug 2015 07:49:20 -0600
Subject: net: Updates to netif_index_is_vrf

As Eric noted netif_index_is_vrf is not called with rcu_read_lock held,
so wrap the dev_get_by_index_rcu in rcu_read_lock and unlock.

If VRF is not enabled or oif is 0 skip the device lookup. In both cases
index cannot be the VRF master.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d131755516ca..8a530f930754 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3822,12 +3822,22 @@ static inline bool netif_is_vrf(const struct net_device *dev)
 
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
-	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
 	bool rc = false;
 
+#if IS_ENABLED(CONFIG_NET_VRF)
+	struct net_device *dev;
+
+	if (ifindex == 0)
+		return false;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
 		rc = netif_is_vrf(dev);
 
+	rcu_read_unlock();
+#endif
 	return rc;
 }
 
-- 
cgit v1.2.3


From 808d28c468a89e8680396d98aea96024b6f5afdc Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Aug 2015 10:26:49 -0600
Subject: net: Fix docbook warning for IFF_VRF_MASTER enum

kbuild test robot reported:
tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   d52736e24fe2e927c26817256f8d1a3c8b5d51a0
commit: 4e3c89920cd3a6cfce22c6f537690747c26128dd [751/762] net: Introduce VRF related flags and helpers
reproduce: make htmldocs

>> Warning(include/linux/netdevice.h:1293): Enum value 'IFF_VRF_MASTER' not described in enum 'netdev_priv_flags'

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8a530f930754..4bd177faa90c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1262,6 +1262,7 @@ struct net_device_ops {
  * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
  *	change when it's running
  * @IFF_MACVLAN: Macvlan device
+ * @IFF_VRF_MASTER: device is a VRF master
  * @IFF_NO_QUEUE: device can run without qdisc attached
  */
 enum netdev_priv_flags {
-- 
cgit v1.2.3


From 74f4e0cc61080f63f28e8d519bdf437957e64217 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 14 Aug 2015 00:21:45 +0200
Subject: bcma: switch GPIO portions to use GPIOLIB_IRQCHIP

This switches the BCMA GPIO driver to use GPIOLIB_IRQCHIP to
handle its interrupts instead of rolling its own copy of the
irqdomain handling etc.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/bcma/Kconfig                        |  2 +-
 drivers/bcma/driver_gpio.c                  | 92 ++++++++++-------------------
 include/linux/bcma/bcma_driver_chipcommon.h |  1 -
 3 files changed, 31 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/bcma/Kconfig b/drivers/bcma/Kconfig
index be5fffb6da24..023d448ed3fa 100644
--- a/drivers/bcma/Kconfig
+++ b/drivers/bcma/Kconfig
@@ -92,7 +92,7 @@ config BCMA_DRIVER_GMAC_CMN
 config BCMA_DRIVER_GPIO
 	bool "BCMA GPIO driver"
 	depends on BCMA && GPIOLIB
-	select IRQ_DOMAIN if BCMA_HOST_SOC
+	select GPIOLIB_IRQCHIP if BCMA_HOST_SOC
 	help
 	  Driver to provide access to the GPIO pins of the bcma bus.
 
diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c
index 5f6018e7cd4c..504899a72966 100644
--- a/drivers/bcma/driver_gpio.c
+++ b/drivers/bcma/driver_gpio.c
@@ -8,10 +8,8 @@
  * Licensed under the GNU/GPL. See COPYING for details.
  */
 
-#include <linux/gpio.h>
-#include <linux/irq.h>
+#include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
-#include <linux/irqdomain.h>
 #include <linux/export.h>
 #include <linux/bcma/bcma.h>
 
@@ -79,19 +77,11 @@ static void bcma_gpio_free(struct gpio_chip *chip, unsigned gpio)
 }
 
 #if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X)
-static int bcma_gpio_to_irq(struct gpio_chip *chip, unsigned gpio)
-{
-	struct bcma_drv_cc *cc = bcma_gpio_get_cc(chip);
-
-	if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC)
-		return irq_find_mapping(cc->irq_domain, gpio);
-	else
-		return -EINVAL;
-}
 
 static void bcma_gpio_irq_unmask(struct irq_data *d)
 {
-	struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d);
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc);
 	int gpio = irqd_to_hwirq(d);
 	u32 val = bcma_chipco_gpio_in(cc, BIT(gpio));
 
@@ -101,7 +91,8 @@ static void bcma_gpio_irq_unmask(struct irq_data *d)
 
 static void bcma_gpio_irq_mask(struct irq_data *d)
 {
-	struct bcma_drv_cc *cc = irq_data_get_irq_chip_data(d);
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct bcma_drv_cc *cc = bcma_gpio_get_cc(gc);
 	int gpio = irqd_to_hwirq(d);
 
 	bcma_chipco_gpio_intmask(cc, BIT(gpio), 0);
@@ -116,6 +107,7 @@ static struct irq_chip bcma_gpio_irq_chip = {
 static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id)
 {
 	struct bcma_drv_cc *cc = dev_id;
+	struct gpio_chip *gc = &cc->gpio;
 	u32 val = bcma_cc_read32(cc, BCMA_CC_GPIOIN);
 	u32 mask = bcma_cc_read32(cc, BCMA_CC_GPIOIRQ);
 	u32 pol = bcma_cc_read32(cc, BCMA_CC_GPIOPOL);
@@ -125,81 +117,58 @@ static irqreturn_t bcma_gpio_irq_handler(int irq, void *dev_id)
 	if (!irqs)
 		return IRQ_NONE;
 
-	for_each_set_bit(gpio, &irqs, cc->gpio.ngpio)
-		generic_handle_irq(bcma_gpio_to_irq(&cc->gpio, gpio));
+	for_each_set_bit(gpio, &irqs, gc->ngpio)
+		generic_handle_irq(irq_find_mapping(gc->irqdomain, gpio));
 	bcma_chipco_gpio_polarity(cc, irqs, val & irqs);
 
 	return IRQ_HANDLED;
 }
 
-static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc)
+static int bcma_gpio_irq_init(struct bcma_drv_cc *cc)
 {
 	struct gpio_chip *chip = &cc->gpio;
-	int gpio, hwirq, err;
+	int hwirq, err;
 
 	if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC)
 		return 0;
 
-	cc->irq_domain = irq_domain_add_linear(NULL, chip->ngpio,
-					       &irq_domain_simple_ops, cc);
-	if (!cc->irq_domain) {
-		err = -ENODEV;
-		goto err_irq_domain;
-	}
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_create_mapping(cc->irq_domain, gpio);
-
-		irq_set_chip_data(irq, cc);
-		irq_set_chip_and_handler(irq, &bcma_gpio_irq_chip,
-					 handle_simple_irq);
-	}
-
 	hwirq = bcma_core_irq(cc->core, 0);
 	err = request_irq(hwirq, bcma_gpio_irq_handler, IRQF_SHARED, "gpio",
 			  cc);
 	if (err)
-		goto err_req_irq;
+		return err;
 
 	bcma_chipco_gpio_intmask(cc, ~0, 0);
 	bcma_cc_set32(cc, BCMA_CC_IRQMASK, BCMA_CC_IRQ_GPIO);
 
-	return 0;
-
-err_req_irq:
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_find_mapping(cc->irq_domain, gpio);
-
-		irq_dispose_mapping(irq);
+	err =  gpiochip_irqchip_add(chip,
+				    &bcma_gpio_irq_chip,
+				    0,
+				    handle_simple_irq,
+				    IRQ_TYPE_NONE);
+	if (err) {
+		free_irq(hwirq, cc);
+		return err;
 	}
-	irq_domain_remove(cc->irq_domain);
-err_irq_domain:
-	return err;
+
+	return 0;
 }
 
-static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc)
+static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc)
 {
-	struct gpio_chip *chip = &cc->gpio;
-	int gpio;
-
 	if (cc->core->bus->hosttype != BCMA_HOSTTYPE_SOC)
 		return;
 
 	bcma_cc_mask32(cc, BCMA_CC_IRQMASK, ~BCMA_CC_IRQ_GPIO);
 	free_irq(bcma_core_irq(cc->core, 0), cc);
-	for (gpio = 0; gpio < chip->ngpio; gpio++) {
-		int irq = irq_find_mapping(cc->irq_domain, gpio);
-
-		irq_dispose_mapping(irq);
-	}
-	irq_domain_remove(cc->irq_domain);
 }
 #else
-static int bcma_gpio_irq_domain_init(struct bcma_drv_cc *cc)
+static int bcma_gpio_irq_init(struct bcma_drv_cc *cc)
 {
 	return 0;
 }
 
-static void bcma_gpio_irq_domain_exit(struct bcma_drv_cc *cc)
+static void bcma_gpio_irq_exit(struct bcma_drv_cc *cc)
 {
 }
 #endif
@@ -218,9 +187,8 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	chip->set		= bcma_gpio_set_value;
 	chip->direction_input	= bcma_gpio_direction_input;
 	chip->direction_output	= bcma_gpio_direction_output;
-#if IS_BUILTIN(CONFIG_BCM47XX) || IS_BUILTIN(CONFIG_ARCH_BCM_5301X)
-	chip->to_irq		= bcma_gpio_to_irq;
-#endif
+	chip->owner		= THIS_MODULE;
+	chip->dev		= bcma_bus_get_host_dev(bus);
 #if IS_BUILTIN(CONFIG_OF)
 	if (cc->core->bus->hosttype == BCMA_HOSTTYPE_SOC)
 		chip->of_node	= cc->core->dev.of_node;
@@ -248,13 +216,13 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	else
 		chip->base		= -1;
 
-	err = bcma_gpio_irq_domain_init(cc);
+	err = gpiochip_add(chip);
 	if (err)
 		return err;
 
-	err = gpiochip_add(chip);
+	err = bcma_gpio_irq_init(cc);
 	if (err) {
-		bcma_gpio_irq_domain_exit(cc);
+		gpiochip_remove(chip);
 		return err;
 	}
 
@@ -263,7 +231,7 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 
 int bcma_gpio_unregister(struct bcma_drv_cc *cc)
 {
-	bcma_gpio_irq_domain_exit(cc);
+	bcma_gpio_irq_exit(cc);
 	gpiochip_remove(&cc->gpio);
 	return 0;
 }
diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h
index 6cceedf65ca2..cf038431a5cc 100644
--- a/include/linux/bcma/bcma_driver_chipcommon.h
+++ b/include/linux/bcma/bcma_driver_chipcommon.h
@@ -640,7 +640,6 @@ struct bcma_drv_cc {
 	spinlock_t gpio_lock;
 #ifdef CONFIG_BCMA_DRIVER_GPIO
 	struct gpio_chip gpio;
-	struct irq_domain *irq_domain;
 #endif
 };
 
-- 
cgit v1.2.3


From f4e774f55fe0bb568a0877b2eb9e1b4b5a6f5cbc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Aug 2015 09:46:22 +0200
Subject: average: remove out-of-line implementation

Since all users are now converted to the inline implementation,
remove the out-of-line implementation entirely.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/average.h | 24 -------------------
 lib/Kconfig             | 10 --------
 lib/average.c           | 64 -------------------------------------------------
 3 files changed, 98 deletions(-)
 delete mode 100644 lib/average.c

(limited to 'include/linux')

diff --git a/include/linux/average.h b/include/linux/average.h
index 60f8226c5652..d04aa58280de 100644
--- a/include/linux/average.h
+++ b/include/linux/average.h
@@ -3,30 +3,6 @@
 
 /* Exponentially weighted moving average (EWMA) */
 
-/* For more documentation see lib/average.c */
-
-struct ewma {
-	unsigned long internal;
-	unsigned long factor;
-	unsigned long weight;
-};
-
-extern void ewma_init(struct ewma *avg, unsigned long factor,
-		      unsigned long weight);
-
-extern struct ewma *ewma_add(struct ewma *avg, unsigned long val);
-
-/**
- * ewma_read() - Get average value
- * @avg: Average structure
- *
- * Returns the average value held in @avg.
- */
-static inline unsigned long ewma_read(const struct ewma *avg)
-{
-	return avg->internal >> avg->factor;
-}
-
 #define DECLARE_EWMA(name, _factor, _weight)				\
 	struct ewma_##name {						\
 		unsigned long internal;					\
diff --git a/lib/Kconfig b/lib/Kconfig
index 3a2ef67db6c7..278890dd1049 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -460,16 +460,6 @@ config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 config LRU_CACHE
 	tristate
 
-config AVERAGE
-	bool "Averaging functions"
-	help
-	  This option is provided for the case where no in-kernel-tree
-	  modules require averaging functions, but a module built outside
-	  the kernel tree does. Such modules that use library averaging
-	  functions require Y here.
-
-	  If unsure, say N.
-
 config CLZ_TAB
 	bool
 
diff --git a/lib/average.c b/lib/average.c
deleted file mode 100644
index 114d1beae0c7..000000000000
--- a/lib/average.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * lib/average.c
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2.  See the file COPYING for more details.
- */
-
-#include <linux/export.h>
-#include <linux/average.h>
-#include <linux/kernel.h>
-#include <linux/bug.h>
-#include <linux/log2.h>
-
-/**
- * DOC: Exponentially Weighted Moving Average (EWMA)
- *
- * These are generic functions for calculating Exponentially Weighted Moving
- * Averages (EWMA). We keep a structure with the EWMA parameters and a scaled
- * up internal representation of the average value to prevent rounding errors.
- * The factor for scaling up and the exponential weight (or decay rate) have to
- * be specified thru the init fuction. The structure should not be accessed
- * directly but only thru the helper functions.
- */
-
-/**
- * ewma_init() - Initialize EWMA parameters
- * @avg: Average structure
- * @factor: Factor to use for the scaled up internal value. The maximum value
- *	of averages can be ULONG_MAX/(factor*weight). For performance reasons
- *	factor has to be a power of 2.
- * @weight: Exponential weight, or decay rate. This defines how fast the
- *	influence of older values decreases. For performance reasons weight has
- *	to be a power of 2.
- *
- * Initialize the EWMA parameters for a given struct ewma @avg.
- */
-void ewma_init(struct ewma *avg, unsigned long factor, unsigned long weight)
-{
-	WARN_ON(!is_power_of_2(weight) || !is_power_of_2(factor));
-
-	avg->weight = ilog2(weight);
-	avg->factor = ilog2(factor);
-	avg->internal = 0;
-}
-EXPORT_SYMBOL(ewma_init);
-
-/**
- * ewma_add() - Exponentially weighted moving average (EWMA)
- * @avg: Average structure
- * @val: Current value
- *
- * Add a sample to the average.
- */
-struct ewma *ewma_add(struct ewma *avg, unsigned long val)
-{
-	unsigned long internal = ACCESS_ONCE(avg->internal);
-
-	ACCESS_ONCE(avg->internal) = internal ?
-		(((internal << avg->weight) - internal) +
-			(val << avg->factor)) >> avg->weight :
-		(val << avg->factor);
-	return avg;
-}
-EXPORT_SYMBOL(ewma_add);
-- 
cgit v1.2.3


From b7fe10e5ebac2a3f37e95535e616494b65fa020f Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 19 Aug 2015 17:07:32 -0700
Subject: gro: Fix remcsum offload to deal with frags in GRO

The remote checksum offload GRO did not consider the case that frag0
might be in use. This patch fixes that by accessing headers using the
skb_gro functions and not saving offsets relative to skb->head.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c       | 23 +++++++++--------------
 include/linux/netdevice.h | 44 ++++++++++++++++++++++++++++++++------------
 net/ipv4/fou.c            | 28 ++++++++++++----------------
 3 files changed, 53 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 54615bb9d916..64fcd2402562 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -519,10 +519,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 					  u32 data, struct gro_remcsum *grc,
 					  bool nopartial)
 {
-	size_t start, offset, plen;
+	size_t start, offset;
 
 	if (skb->remcsum_offload)
-		return NULL;
+		return vh;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
@@ -532,17 +532,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
 			  offsetof(struct udphdr, check) :
 			  offsetof(struct tcphdr, check));
 
-	plen = hdrlen + offset + sizeof(u16);
-
-	/* Pull checksum that will be written */
-	if (skb_gro_header_hard(skb, off + plen)) {
-		vh = skb_gro_header_slow(skb, off + plen, off);
-		if (!vh)
-			return NULL;
-	}
-
-	skb_gro_remcsum_process(skb, (void *)vh + hdrlen,
-				start, offset, grc, nopartial);
+	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
+				     start, offset, grc, nopartial);
 
 	skb->remcsum_offload = 1;
 
@@ -573,7 +564,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
-	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
 	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
 
 	flags = ntohl(vh->vx_flags);
@@ -588,6 +578,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
 			goto out;
 	}
 
+	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
+
 	flush = 0;
 
 	for (p = *head; p; p = p->next) {
@@ -1110,6 +1102,9 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
 {
 	size_t start, offset, plen;
 
+	if (skb->remcsum_offload)
+		return vh;
+
 	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
 	offset = start + ((data & VXLAN_RCO_UDP) ?
 			  offsetof(struct udphdr, check) :
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4bd177faa90c..6abe0d6f1e1d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2311,8 +2311,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
 
 static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb)
 {
-	return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) ==
-		skb_gro_offset(skb));
+	return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb));
 }
 
 static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
@@ -2408,37 +2407,58 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc)
 	grc->delta = 0;
 }
 
-static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
-					   int start, int offset,
-					   struct gro_remcsum *grc,
-					   bool nopartial)
+static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
+					    unsigned int off, size_t hdrlen,
+					    int start, int offset,
+					    struct gro_remcsum *grc,
+					    bool nopartial)
 {
 	__wsum delta;
+	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
 	BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
 
 	if (!nopartial) {
-		NAPI_GRO_CB(skb)->gro_remcsum_start =
-		    ((unsigned char *)ptr + start) - skb->head;
-		return;
+		NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
+		return ptr;
+	}
+
+	ptr = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, off + plen)) {
+		ptr = skb_gro_header_slow(skb, off + plen, off);
+		if (!ptr)
+			return NULL;
 	}
 
-	delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset);
+	delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
+			       start, offset);
 
 	/* Adjust skb->csum since we changed the packet */
 	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
 
-	grc->offset = (ptr + offset) - (void *)skb->head;
+	grc->offset = off + hdrlen + offset;
 	grc->delta = delta;
+
+	return ptr;
 }
 
 static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 					   struct gro_remcsum *grc)
 {
+	void *ptr;
+	size_t plen = grc->offset + sizeof(u16);
+
 	if (!grc->delta)
 		return;
 
-	remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta);
+	ptr = skb_gro_header_fast(skb, grc->offset);
+	if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) {
+		ptr = skb_gro_header_slow(skb, plen, grc->offset);
+		if (!ptr)
+			return;
+	}
+
+	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 34968cd5c146..eb11f9506894 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -79,7 +79,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
 	__be16 *pd = data;
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
-	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+	size_t plen = sizeof(struct udphdr) + hdrlen +
+	    max_t(size_t, offset + sizeof(u16), start);
+
+	if (skb->remcsum_offload)
+		return guehdr;
 
 	if (!pskb_may_pull(skb, plen))
 		return NULL;
@@ -221,29 +225,21 @@ out_unlock:
 
 static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
 				      struct guehdr *guehdr, void *data,
-				      size_t hdrlen, u8 ipproto,
-				      struct gro_remcsum *grc, bool nopartial)
+				      size_t hdrlen, struct gro_remcsum *grc,
+				      bool nopartial)
 {
 	__be16 *pd = data;
 	size_t start = ntohs(pd[0]);
 	size_t offset = ntohs(pd[1]);
-	size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
 
 	if (skb->remcsum_offload)
-		return NULL;
+		return guehdr;
 
 	if (!NAPI_GRO_CB(skb)->csum_valid)
 		return NULL;
 
-	/* Pull checksum that will be written */
-	if (skb_gro_header_hard(skb, off + plen)) {
-		guehdr = skb_gro_header_slow(skb, off + plen, off);
-		if (!guehdr)
-			return NULL;
-	}
-
-	skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen,
-				start, offset, grc, nopartial);
+	guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen,
+					 start, offset, grc, nopartial);
 
 	skb->remcsum_offload = 1;
 
@@ -307,10 +303,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
 
 		if (flags & GUE_PFLAG_REMCSUM) {
 			guehdr = gue_gro_remcsum(skb, off, guehdr,
-						 data + doffset, hdrlen,
-						 guehdr->proto_ctype, &grc,
+						 data + doffset, hdrlen, &grc,
 						 !!(fou->flags &
 						    FOU_F_REMCSUM_NOPARTIAL));
+
 			if (!guehdr)
 				goto out;
 
-- 
cgit v1.2.3


From 0e4ead9d7b3655d76371604abb9b0dcc4e79bb7d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:18 +0200
Subject: net: introduce change upper device notifier change info

Add info that is passed along with NETDEV_CHANGEUPPER event.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 +++++++
 net/core/dev.c            | 16 ++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6abe0d6f1e1d..39f30daac483 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2127,6 +2127,13 @@ struct netdev_notifier_change_info {
 	unsigned int flags_changed;
 };
 
+struct netdev_notifier_changeupper_info {
+	struct netdev_notifier_info info; /* must be first */
+	struct net_device *upper_dev; /* new upper dev */
+	bool master; /* is upper dev master */
+	bool linking; /* is the nofication for link or unlink */
+};
+
 static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
 					     struct net_device *dev)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 7bb24f1879b8..a8e6cf4298d3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5311,6 +5311,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master,
 				   void *private)
 {
+	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
 	int ret = 0;
 
@@ -5329,6 +5330,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (master && netdev_master_upper_dev_get(dev))
 		return -EBUSY;
 
+	changeupper_info.upper_dev = upper_dev;
+	changeupper_info.master = master;
+	changeupper_info.linking = true;
+
 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
 						   master);
 	if (ret)
@@ -5367,7 +5372,8 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 	return 0;
 
 rollback_lower_mesh:
@@ -5462,9 +5468,14 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
+	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j;
 	ASSERT_RTNL();
 
+	changeupper_info.upper_dev = upper_dev;
+	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
+	changeupper_info.linking = false;
+
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
 	/* Here is the tricky part. We must remove all dev's lower
@@ -5484,7 +5495,8 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
 		__netdev_adjacent_dev_unlink(dev, i->dev);
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
cgit v1.2.3


From 0894ae3f0a587bda9733ec4a4b67af7ded3a9498 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:19 +0200
Subject: net: add netif_is_bridge_master helper

Add this helper so code can easily figure out if netdev is a bridge.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 39f30daac483..be625f4754b9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3848,6 +3848,11 @@ static inline bool netif_is_vrf(const struct net_device *dev)
 	return dev->priv_flags & IFF_VRF_MASTER;
 }
 
+static inline bool netif_is_bridge_master(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_EBRIDGE;
+}
+
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
 	bool rc = false;
-- 
cgit v1.2.3


From 35d4e1725202e6656fcfa8b88447327ad3ae0c0c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:20 +0200
Subject: net: add netif_is_ovs_master helper with IFF_OPENVSWITCH private flag

Add this helper so code can easily figure out if netdev is openswitch.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h            | 8 ++++++++
 net/openvswitch/vport-internal_dev.c | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be625f4754b9..6656a43c072d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1264,6 +1264,7 @@ struct net_device_ops {
  * @IFF_MACVLAN: Macvlan device
  * @IFF_VRF_MASTER: device is a VRF master
  * @IFF_NO_QUEUE: device can run without qdisc attached
+ * @IFF_OPENVSWITCH: device is a Open vSwitch master
  */
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
@@ -1293,6 +1294,7 @@ enum netdev_priv_flags {
 	IFF_IPVLAN_SLAVE		= 1<<24,
 	IFF_VRF_MASTER			= 1<<25,
 	IFF_NO_QUEUE			= 1<<26,
+	IFF_OPENVSWITCH			= 1<<27,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1322,6 +1324,7 @@ enum netdev_priv_flags {
 #define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 #define IFF_VRF_MASTER			IFF_VRF_MASTER
 #define IFF_NO_QUEUE			IFF_NO_QUEUE
+#define IFF_OPENVSWITCH			IFF_OPENVSWITCH
 
 /**
  *	struct net_device - The DEVICE structure.
@@ -3853,6 +3856,11 @@ static inline bool netif_is_bridge_master(const struct net_device *dev)
 	return dev->priv_flags & IFF_EBRIDGE;
 }
 
+static inline bool netif_is_ovs_master(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_OPENVSWITCH;
+}
+
 static inline bool netif_index_is_vrf(struct net *net, int ifindex)
 {
 	bool rc = false;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index c058bbf876c3..80b3e12ec882 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -135,7 +135,7 @@ static void do_setup(struct net_device *netdev)
 	netdev->netdev_ops = &internal_dev_netdev_ops;
 
 	netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
-	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
 	netdev->destructor = internal_dev_destructor;
 	netdev->ethtool_ops = &internal_dev_ethtool_ops;
 	netdev->rtnl_link_ops = &internal_dev_link_ops;
-- 
cgit v1.2.3


From 0dc1549bfd67053181415a3f7544628a6bcd2a08 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Aug 2015 09:31:21 +0200
Subject: net: kill long time unused bonding private flags

We don't use them for years, just kill them now.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 57 +++++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6656a43c072d..88a00694eda5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1240,13 +1240,8 @@ struct net_device_ops {
  *
  * @IFF_802_1Q_VLAN: 802.1Q VLAN device
  * @IFF_EBRIDGE: Ethernet bridging device
- * @IFF_SLAVE_INACTIVE: bonding slave not the curr. active
- * @IFF_MASTER_8023AD: bonding master, 802.3ad
- * @IFF_MASTER_ALB: bonding master, balance-alb
  * @IFF_BONDING: bonding master or slave
- * @IFF_SLAVE_NEEDARP: need ARPs for validation
  * @IFF_ISATAP: ISATAP interface (RFC4214)
- * @IFF_MASTER_ARPMON: bonding master, ARP mon in use
  * @IFF_WAN_HDLC: WAN HDLC device
  * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
  *	release skb->dst
@@ -1269,43 +1264,33 @@ struct net_device_ops {
 enum netdev_priv_flags {
 	IFF_802_1Q_VLAN			= 1<<0,
 	IFF_EBRIDGE			= 1<<1,
-	IFF_SLAVE_INACTIVE		= 1<<2,
-	IFF_MASTER_8023AD		= 1<<3,
-	IFF_MASTER_ALB			= 1<<4,
-	IFF_BONDING			= 1<<5,
-	IFF_SLAVE_NEEDARP		= 1<<6,
-	IFF_ISATAP			= 1<<7,
-	IFF_MASTER_ARPMON		= 1<<8,
-	IFF_WAN_HDLC			= 1<<9,
-	IFF_XMIT_DST_RELEASE		= 1<<10,
-	IFF_DONT_BRIDGE			= 1<<11,
-	IFF_DISABLE_NETPOLL		= 1<<12,
-	IFF_MACVLAN_PORT		= 1<<13,
-	IFF_BRIDGE_PORT			= 1<<14,
-	IFF_OVS_DATAPATH		= 1<<15,
-	IFF_TX_SKB_SHARING		= 1<<16,
-	IFF_UNICAST_FLT			= 1<<17,
-	IFF_TEAM_PORT			= 1<<18,
-	IFF_SUPP_NOFCS			= 1<<19,
-	IFF_LIVE_ADDR_CHANGE		= 1<<20,
-	IFF_MACVLAN			= 1<<21,
-	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
-	IFF_IPVLAN_MASTER		= 1<<23,
-	IFF_IPVLAN_SLAVE		= 1<<24,
-	IFF_VRF_MASTER			= 1<<25,
-	IFF_NO_QUEUE			= 1<<26,
-	IFF_OPENVSWITCH			= 1<<27,
+	IFF_BONDING			= 1<<2,
+	IFF_ISATAP			= 1<<3,
+	IFF_WAN_HDLC			= 1<<4,
+	IFF_XMIT_DST_RELEASE		= 1<<5,
+	IFF_DONT_BRIDGE			= 1<<6,
+	IFF_DISABLE_NETPOLL		= 1<<7,
+	IFF_MACVLAN_PORT		= 1<<8,
+	IFF_BRIDGE_PORT			= 1<<9,
+	IFF_OVS_DATAPATH		= 1<<10,
+	IFF_TX_SKB_SHARING		= 1<<11,
+	IFF_UNICAST_FLT			= 1<<12,
+	IFF_TEAM_PORT			= 1<<13,
+	IFF_SUPP_NOFCS			= 1<<14,
+	IFF_LIVE_ADDR_CHANGE		= 1<<15,
+	IFF_MACVLAN			= 1<<16,
+	IFF_XMIT_DST_RELEASE_PERM	= 1<<17,
+	IFF_IPVLAN_MASTER		= 1<<18,
+	IFF_IPVLAN_SLAVE		= 1<<19,
+	IFF_VRF_MASTER			= 1<<20,
+	IFF_NO_QUEUE			= 1<<21,
+	IFF_OPENVSWITCH			= 1<<22,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 #define IFF_EBRIDGE			IFF_EBRIDGE
-#define IFF_SLAVE_INACTIVE		IFF_SLAVE_INACTIVE
-#define IFF_MASTER_8023AD		IFF_MASTER_8023AD
-#define IFF_MASTER_ALB			IFF_MASTER_ALB
 #define IFF_BONDING			IFF_BONDING
-#define IFF_SLAVE_NEEDARP		IFF_SLAVE_NEEDARP
 #define IFF_ISATAP			IFF_ISATAP
-#define IFF_MASTER_ARPMON		IFF_MASTER_ARPMON
 #define IFF_WAN_HDLC			IFF_WAN_HDLC
 #define IFF_XMIT_DST_RELEASE		IFF_XMIT_DST_RELEASE
 #define IFF_DONT_BRIDGE			IFF_DONT_BRIDGE
-- 
cgit v1.2.3


From 2e4cfae2a8e3f9ce3925c9b6e9e865fe8476fc4f Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Thu, 27 Aug 2015 15:25:45 -0700
Subject: netfilter: Define v6ops in !CONFIG_NETFILTER case.

When CONFIG_OPENVSWITCH is set, and CONFIG_NETFILTER is not set, the
openvswitch IPv6 fragmentation handling cannot refer to ipv6_ops because
it isn't defined. Add a dummy version to avoid #ifdefs in source files.

Fixes: 7f8a436 "openvswitch: Add conntrack action"
Signed-off-by: Joe Stringer <joestringer@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv6.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 8b7d28f3aada..771574677e83 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -9,15 +9,6 @@
 
 #include <uapi/linux/netfilter_ipv6.h>
 
-
-#ifdef CONFIG_NETFILTER
-int ip6_route_me_harder(struct sk_buff *skb);
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
-			unsigned int dataoff, u_int8_t protocol);
-
-int ipv6_netfilter_init(void);
-void ipv6_netfilter_fini(void);
-
 /*
  * Hook functions for ipv6 to allow xt_* modules to be built-in even
  * if IPv6 is a module.
@@ -30,6 +21,14 @@ struct nf_ipv6_ops {
 			int (*output)(struct sock *, struct sk_buff *));
 };
 
+#ifdef CONFIG_NETFILTER
+int ip6_route_me_harder(struct sk_buff *skb);
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+			unsigned int dataoff, u_int8_t protocol);
+
+int ipv6_netfilter_init(void);
+void ipv6_netfilter_fini(void);
+
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
 static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void)
 {
@@ -39,6 +38,7 @@ static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void)
 #else /* CONFIG_NETFILTER */
 static inline int ipv6_netfilter_init(void) { return 0; }
 static inline void ipv6_netfilter_fini(void) { return; }
+static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) { return NULL; }
 #endif /* CONFIG_NETFILTER */
 
 #endif /*__LINUX_IP6_NETFILTER_H*/
-- 
cgit v1.2.3


From df2cf4a78e488d26728590cb3c6b4fe4c4862c77 Mon Sep 17 00:00:00 2001
From: Philip Downey <pdowney@brocade.com>
Date: Thu, 27 Aug 2015 16:46:26 +0100
Subject: IGMP: Inhibit reports for local multicast groups

The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is
reserved for the use of routing protocols and other low-level topology
discovery or maintenance protocols, such as gateway discovery and
group membership reporting.  Multicast routers should not forward any
multicast datagram with destination addresses in this range,
regardless of its TTL.

Currently, IGMP reports are generated for this reserved range of
addresses even though a router will ignore this information since it
has no purpose.  However, the presence of reserved group addresses in
an IGMP membership report uses up network bandwidth and can also
obscure addresses of interest when inspecting membership reports using
packet inspection or debug messages.

Although the RFCs for the various version of IGMP (e.g.RFC 3376 for
v3) do not specify that the reserved addresses be excluded from
membership reports, it should do no harm in doing so.  In particular
there should be no adverse effect in any IGMP snooping functionality
since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD
Snooping Switches Considerations) section 2.1.2. Data Forwarding
Rules:

    2) Packets with a destination IP (DIP) address in the 224.0.0.X
       range which are not IGMP must be forwarded on all ports.

IGMP reports for local multicast groups can now be optionally
inhibited by means of a system control variable (by setting the value
to zero) e.g.:
    echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports

To retain backwards compatibility the previous behaviour is retained
by default on system boot or reverted by setting the value back to
non-zero e.g.:
    echo 1 >  /proc/sys/net/ipv4/igmp_link_local_mcast_reports

Signed-off-by: Philip Downey <pdowney@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h       |  1 +
 net/ipv4/igmp.c            | 26 +++++++++++++++++++++++++-
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 193ad488d3e2..908429216d9f 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -37,6 +37,7 @@ static inline struct igmpv3_query *
 	return (struct igmpv3_query *)skb_transport_header(skb);
 }
 
+extern int sysctl_igmp_llm_reports;
 extern int sysctl_igmp_max_memberships;
 extern int sysctl_igmp_max_msf;
 extern int sysctl_igmp_qrv;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9fdfd9deac11..d38b8b61eaee 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -110,6 +110,9 @@
 #define IP_MAX_MEMBERSHIPS	20
 #define IP_MAX_MSF		10
 
+/* IGMP reports for link-local multicast groups are enabled by default */
+int sysctl_igmp_llm_reports __read_mostly = 1;
+
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
 
@@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 
 	if (pmc->multiaddr == IGMP_ALL_HOSTS)
 		return skb;
+	if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+		return skb;
 
 	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
 		  type == IGMPV3_MODE_IS_EXCLUDE;
@@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
 		for_each_pmc_rcu(in_dev, pmc) {
 			if (pmc->multiaddr == IGMP_ALL_HOSTS)
 				continue;
+			if (ipv4_is_local_multicast(pmc->multiaddr) &&
+			     !sysctl_igmp_llm_reports)
+				continue;
 			spin_lock_bh(&pmc->lock);
 			if (pmc->sfcount[MCAST_EXCLUDE])
 				type = IGMPV3_MODE_IS_EXCLUDE;
@@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 
 	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
 		return igmpv3_send_report(in_dev, pmc);
-	else if (type == IGMP_HOST_LEAVE_MESSAGE)
+
+	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+		return 0;
+
+	if (type == IGMP_HOST_LEAVE_MESSAGE)
 		dst = IGMP_ALL_ROUTER;
 	else
 		dst = group;
@@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
 
 	if (group == IGMP_ALL_HOSTS)
 		return false;
+	if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+		return false;
 
 	rcu_read_lock();
 	for_each_pmc_rcu(in_dev, im) {
@@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 			continue;
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !sysctl_igmp_llm_reports)
+			continue;
 		spin_lock_bh(&im->lock);
 		if (im->tm_running)
 			im->gsquery = im->gsquery && mark;
@@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
+	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+		return;
 
 	reporter = im->reporter;
 	igmp_stop_timer(im);
@@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im)
 #ifdef CONFIG_IP_MULTICAST
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
+	if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+		return;
 
 	if (in_dev->dead)
 		return;
@@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
 	for_each_pmc_rtnl(in_dev, im) {
 		if (im->multiaddr == IGMP_ALL_HOSTS)
 			continue;
+		if (ipv4_is_local_multicast(im->multiaddr) &&
+		    !sysctl_igmp_llm_reports)
+			continue;
 
 		/* a failover is happening and switches
 		 * must be notified immediately
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 879bdc5c95b1..894da3a70aff 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -929,6 +929,13 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "igmp_link_local_mcast_reports",
+		.data		= &sysctl_igmp_llm_reports,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 28 Aug 2015 15:56:22 -0700
Subject: lib: introduce strncpy_from_unsafe()

generalize FETCH_FUNC_NAME(memory, string) into
strncpy_from_unsafe() and fix sparse warnings that were
present in original implementation.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/uaccess.h     |  2 ++
 kernel/trace/trace_kprobe.c | 20 ++++----------------
 lib/strncpy_from_user.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ae572c138607..d6f2c2c5b043 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -129,4 +129,6 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size);
 extern long notrace probe_kernel_write(void *dst, const void *src, size_t size);
 extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size);
 
+extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
+
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
 static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 					    void *addr, void *dest)
 {
-	long ret;
 	int maxlen = get_rloc_len(*(u32 *)dest);
 	u8 *dst = get_rloc_data(dest);
-	u8 *src = addr;
-	mm_segment_t old_fs = get_fs();
+	long ret;
 
 	if (!maxlen)
 		return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 	 * Try to get string again, since the string can be changed while
 	 * probing.
 	 */
-	set_fs(KERNEL_DS);
-	pagefault_disable();
-
-	do
-		ret = __copy_from_user_inatomic(dst++, src++, 1);
-	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
-	dst[-1] = '\0';
-	pagefault_enable();
-	set_fs(old_fs);
+	ret = strncpy_from_unsafe(dst, addr, maxlen);
 
 	if (ret < 0) {	/* Failed to fetch string */
-		((u8 *)get_rloc_data(dest))[0] = '\0';
+		dst[0] = '\0';
 		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
 	} else {
-		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
-					      get_rloc_offs(*(u32 *)dest));
+		*(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
 	}
 }
 NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index e0af6ff73d14..ead8c4a068d1 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -112,3 +112,44 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	return -EFAULT;
 }
 EXPORT_SYMBOL(strncpy_from_user);
+
+/**
+ * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
+ * @dst:   Destination address, in kernel space.  This buffer must be at
+ *         least @count bytes long.
+ * @src:   Unsafe address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL.
+ *
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+{
+	mm_segment_t old_fs = get_fs();
+	const void *src = unsafe_addr;
+	long ret;
+
+	if (unlikely(count <= 0))
+		return 0;
+
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+
+	do {
+		ret = __copy_from_user_inatomic(dst++,
+						(const void __user __force *)src++, 1);
+	} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
+
+	dst[-1] = '\0';
+	pagefault_enable();
+	set_fs(old_fs);
+
+	return ret < 0 ? ret : src - unsafe_addr;
+}
-- 
cgit v1.2.3


From 5a11dd7d9649149f336ca72069d56ce52b21567f Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 31 Aug 2015 15:56:46 +0200
Subject: net: phy: Allow PHY devices to identify themselves as Ethernet
 switches, etc.

Some Ethernet MAC drivers using the PHY library require the hardcoding
of link parameters when interfaced to a switch device, SFP module,
switch to switch port, etc. This has typically lead to various ad-hoc
implementations looking like this:

- using a "fixed PHY" emulated device, which will provide link
  indication towards the Ethernet MAC driver and hardware

- pretend there is no PHY and hardcode link parameters, ala mv643x_eth

Based on that, it is desireable to have the PHY drivers advertise the
correct link parameters, just like regular Ethernet PHYs towards their
CPU Ethernet MAC drivers, however, Ethernet MAC drivers should be able
to tell whether this link should be monitored or not. In the context
of an Ethernet switch, SFP module, switch to switch link, we do not
need to monitor this link since it should be always up.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/fixed_phy.c |  1 +
 include/linux/phy.h         | 12 ++++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 99d9bc19c94a..ce46428ff21f 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -303,6 +303,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 
 	of_node_get(np);
 	phy->dev.of_node = np;
+	phy->is_pseudo_fixed_link = true;
 
 	ret = phy_device_register(phy);
 	if (ret) {
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e5fb1d415961..962387a192f1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -330,6 +330,7 @@ struct phy_c45_device_ids {
  * c45_ids: 802.3-c45 Device Identifers if is_c45.
  * is_c45:  Set to true if this phy uses clause 45 addressing.
  * is_internal: Set to true if this phy is internal to a MAC.
+ * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc.
  * has_fixups: Set to true if this phy has fixups/quirks.
  * suspended: Set to true if this phy has been suspended successfully.
  * state: state of the PHY for management purposes
@@ -368,6 +369,7 @@ struct phy_device {
 	struct phy_c45_device_ids c45_ids;
 	bool is_c45;
 	bool is_internal;
+	bool is_pseudo_fixed_link;
 	bool has_fixups;
 	bool suspended;
 
@@ -688,6 +690,16 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev)
 {
 	return phydev->interface >= PHY_INTERFACE_MODE_RGMII &&
 		phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID;
+};
+
+/*
+ * phy_is_pseudo_fixed_link - Convenience function for testing if this
+ * PHY is the CPU port facing side of an Ethernet switch, or similar.
+ * @phydev: the phy_device struct
+ */
+static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev)
+{
+	return phydev->is_pseudo_fixed_link;
 }
 
 /**
-- 
cgit v1.2.3


From a5597008dbc230876db2d344561d634f4d52ea4a Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 31 Aug 2015 15:56:53 +0200
Subject: phy: fixed_phy: Add gpio to determine link up/down.

An SFP module may have a link up/down status pin which can be
connection to a GPIO line of the host. Add support for reading such an
GPIO in the fixed_phy driver.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/fixed-link.txt         | 14 +++++++++++-
 Documentation/networking/stmmac.txt                |  2 +-
 arch/m68k/coldfire/m5272.c                         |  2 +-
 arch/mips/ar7/platform.c                           |  5 +++--
 arch/mips/bcm47xx/setup.c                          |  2 +-
 drivers/net/ethernet/broadcom/genet/bcmmii.c       |  2 +-
 drivers/net/phy/fixed_phy.c                        | 26 +++++++++++++++++++---
 drivers/of/of_mdio.c                               | 13 ++++++++---
 include/linux/phy_fixed.h                          |  8 +++++--
 9 files changed, 59 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt
index 82bf7e0f47b6..ec5d889fe3d8 100644
--- a/Documentation/devicetree/bindings/net/fixed-link.txt
+++ b/Documentation/devicetree/bindings/net/fixed-link.txt
@@ -17,6 +17,8 @@ properties:
   enabled.
 * 'asym-pause' (boolean, optional), to indicate that asym_pause should
   be enabled.
+* 'link-gpios' ('gpio-list', optional), to indicate if a gpio can be read
+  to determine if the link is up.
 
 Old, deprecated 'fixed-link' binding:
 
@@ -30,7 +32,7 @@ Old, deprecated 'fixed-link' binding:
   - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for
     asymmetric pause
 
-Example:
+Examples:
 
 ethernet@0 {
 	...
@@ -40,3 +42,13 @@ ethernet@0 {
 	};
 	...
 };
+
+ethernet@1 {
+	...
+	fixed-link {
+	      speed = <1000>;
+	      pause;
+	      link-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>;
+	};
+	...
+};
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 2903b1cf4d70..d64a14714236 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -254,7 +254,7 @@ static struct fixed_phy_status stmmac0_fixed_phy_status = {
 
 During the board's device_init we can configure the first
 MAC for fixed_link by calling:
-  fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status));)
+  fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status, -1);
 and the second one, with a real PHY device attached to the bus,
 by using the stmmac_mdio_bus_data structure (to provide the id, the
 reset procedure etc).
diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c
index b15219ed22bf..c525e4c08f84 100644
--- a/arch/m68k/coldfire/m5272.c
+++ b/arch/m68k/coldfire/m5272.c
@@ -126,7 +126,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = {
 static int __init init_BSP(void)
 {
 	m5272_uarts_init();
-	fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status);
+	fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status, -1);
 	return 0;
 }
 
diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c
index be9ff1673ded..298b97715d5f 100644
--- a/arch/mips/ar7/platform.c
+++ b/arch/mips/ar7/platform.c
@@ -679,7 +679,8 @@ static int __init ar7_register_devices(void)
 	}
 
 	if (ar7_has_high_cpmac()) {
-		res = fixed_phy_add(PHY_POLL, cpmac_high.id, &fixed_phy_status);
+		res = fixed_phy_add(PHY_POLL, cpmac_high.id,
+				    &fixed_phy_status, -1);
 		if (!res) {
 			cpmac_get_mac(1, cpmac_high_data.dev_addr);
 
@@ -692,7 +693,7 @@ static int __init ar7_register_devices(void)
 	} else
 		cpmac_low_data.phy_mask = 0xffffffff;
 
-	res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status);
+	res = fixed_phy_add(PHY_POLL, cpmac_low.id, &fixed_phy_status, -1);
 	if (!res) {
 		cpmac_get_mac(0, cpmac_low_data.dev_addr);
 		res = platform_device_register(&cpmac_low);
diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c
index 98c075f81795..17503a05938e 100644
--- a/arch/mips/bcm47xx/setup.c
+++ b/arch/mips/bcm47xx/setup.c
@@ -263,7 +263,7 @@ static int __init bcm47xx_register_bus_complete(void)
 	bcm47xx_leds_register();
 	bcm47xx_workarounds();
 
-	fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status);
+	fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status, -1);
 	return 0;
 }
 device_initcall(bcm47xx_register_bus_complete);
diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index b3679ad1c1c7..c8affad76f36 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -585,7 +585,7 @@ static int bcmgenet_mii_pd_init(struct bcmgenet_priv *priv)
 			.asym_pause = 0,
 		};
 
-		phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL);
+		phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL);
 		if (!phydev || IS_ERR(phydev)) {
 			dev_err(kdev, "failed to register fixed PHY device\n");
 			return -ENODEV;
diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 2f9457f05a2e..1bb70e3cc03e 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -22,6 +22,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include <linux/gpio.h>
 
 #define MII_REGS_NUM 29
 
@@ -38,6 +39,7 @@ struct fixed_phy {
 	struct fixed_phy_status status;
 	int (*link_update)(struct net_device *, struct fixed_phy_status *);
 	struct list_head node;
+	int link_gpio;
 };
 
 static struct platform_device *pdev;
@@ -52,6 +54,9 @@ static int fixed_phy_update_regs(struct fixed_phy *fp)
 	u16 lpagb = 0;
 	u16 lpa = 0;
 
+	if (gpio_is_valid(fp->link_gpio))
+		fp->status.link = !!gpio_get_value_cansleep(fp->link_gpio);
+
 	if (!fp->status.link)
 		goto done;
 	bmsr |= BMSR_LSTATUS | BMSR_ANEGCOMPLETE;
@@ -215,7 +220,8 @@ int fixed_phy_update_state(struct phy_device *phydev,
 EXPORT_SYMBOL(fixed_phy_update_state);
 
 int fixed_phy_add(unsigned int irq, int phy_addr,
-		  struct fixed_phy_status *status)
+		  struct fixed_phy_status *status,
+		  int link_gpio)
 {
 	int ret;
 	struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -231,15 +237,26 @@ int fixed_phy_add(unsigned int irq, int phy_addr,
 
 	fp->addr = phy_addr;
 	fp->status = *status;
+	fp->link_gpio = link_gpio;
+
+	if (gpio_is_valid(fp->link_gpio)) {
+		ret = gpio_request_one(fp->link_gpio, GPIOF_DIR_IN,
+				       "fixed-link-gpio-link");
+		if (ret)
+			goto err_regs;
+	}
 
 	ret = fixed_phy_update_regs(fp);
 	if (ret)
-		goto err_regs;
+		goto err_gpio;
 
 	list_add_tail(&fp->node, &fmb->phys);
 
 	return 0;
 
+err_gpio:
+	if (gpio_is_valid(fp->link_gpio))
+		gpio_free(fp->link_gpio);
 err_regs:
 	kfree(fp);
 	return ret;
@@ -254,6 +271,8 @@ void fixed_phy_del(int phy_addr)
 	list_for_each_entry_safe(fp, tmp, &fmb->phys, node) {
 		if (fp->addr == phy_addr) {
 			list_del(&fp->node);
+			if (gpio_is_valid(fp->link_gpio))
+				gpio_free(fp->link_gpio);
 			kfree(fp);
 			return;
 		}
@@ -266,6 +285,7 @@ static DEFINE_SPINLOCK(phy_fixed_addr_lock);
 
 struct phy_device *fixed_phy_register(unsigned int irq,
 				      struct fixed_phy_status *status,
+				      int link_gpio,
 				      struct device_node *np)
 {
 	struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -282,7 +302,7 @@ struct phy_device *fixed_phy_register(unsigned int irq,
 	phy_addr = phy_fixed_addr++;
 	spin_unlock(&phy_fixed_addr_lock);
 
-	ret = fixed_phy_add(PHY_POLL, phy_addr, status);
+	ret = fixed_phy_add(PHY_POLL, phy_addr, status, link_gpio);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index 7c8c23cc6896..1350fa25cdb0 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -16,6 +16,7 @@
 #include <linux/phy.h>
 #include <linux/phy_fixed.h>
 #include <linux/of.h>
+#include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/of_mdio.h>
 #include <linux/module.h>
@@ -294,6 +295,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 	struct fixed_phy_status status = {};
 	struct device_node *fixed_link_node;
 	const __be32 *fixed_link_prop;
+	int link_gpio;
 	int len, err;
 	struct phy_device *phy;
 	const char *managed;
@@ -302,7 +304,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 	if (err == 0) {
 		if (strcmp(managed, "in-band-status") == 0) {
 			/* status is zeroed, namely its .link member */
-			phy = fixed_phy_register(PHY_POLL, &status, np);
+			phy = fixed_phy_register(PHY_POLL, &status, -1, np);
 			return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 		}
 	}
@@ -318,8 +320,13 @@ int of_phy_register_fixed_link(struct device_node *np)
 		status.pause = of_property_read_bool(fixed_link_node, "pause");
 		status.asym_pause = of_property_read_bool(fixed_link_node,
 							  "asym-pause");
+		link_gpio = of_get_named_gpio_flags(fixed_link_node,
+						    "link-gpios", 0, NULL);
 		of_node_put(fixed_link_node);
-		phy = fixed_phy_register(PHY_POLL, &status, np);
+		if (link_gpio == -EPROBE_DEFER)
+			return -EPROBE_DEFER;
+
+		phy = fixed_phy_register(PHY_POLL, &status, link_gpio, np);
 		return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 	}
 
@@ -331,7 +338,7 @@ int of_phy_register_fixed_link(struct device_node *np)
 		status.speed = be32_to_cpu(fixed_link_prop[2]);
 		status.pause = be32_to_cpu(fixed_link_prop[3]);
 		status.asym_pause = be32_to_cpu(fixed_link_prop[4]);
-		phy = fixed_phy_register(PHY_POLL, &status, np);
+		phy = fixed_phy_register(PHY_POLL, &status, -1, np);
 		return IS_ERR(phy) ? PTR_ERR(phy) : 0;
 	}
 
diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index fe5732d53eda..2400d2ea4f34 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -13,9 +13,11 @@ struct device_node;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_add(unsigned int irq, int phy_id,
-			 struct fixed_phy_status *status);
+			 struct fixed_phy_status *status,
+			 int link_gpio);
 extern struct phy_device *fixed_phy_register(unsigned int irq,
 					     struct fixed_phy_status *status,
+					     int link_gpio,
 					     struct device_node *np);
 extern void fixed_phy_del(int phy_addr);
 extern int fixed_phy_set_link_update(struct phy_device *phydev,
@@ -26,12 +28,14 @@ extern int fixed_phy_update_state(struct phy_device *phydev,
 			   const struct fixed_phy_status *changed);
 #else
 static inline int fixed_phy_add(unsigned int irq, int phy_id,
-				struct fixed_phy_status *status)
+				struct fixed_phy_status *status,
+				int link_gpio)
 {
 	return -ENODEV;
 }
 static inline struct phy_device *fixed_phy_register(unsigned int irq,
 						struct fixed_phy_status *status,
+						int gpio_link,
 						struct device_node *np)
 {
 	return ERR_PTR(-ENODEV);
-- 
cgit v1.2.3


From e5276937ae6e654a811345f0716266f12e77bede Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:23 -0700
Subject: flow_dissector: Move skb related functions to skbuff.h

Move the flow dissector functions that are specific to skbuffs into
skbuff.h out of flow_dissector.h. This makes flow_dissector.h have
no dependencies on skbuff.h.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 47 +++++++++++++++++++++++++++++++++++++++++
 include/net/flow_dissector.h | 50 --------------------------------------------
 2 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 989307f991db..8a697c673b58 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -945,6 +945,53 @@ skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
 	skb->hash = hash;
 }
 
+void __skb_get_hash(struct sk_buff *skb);
+u32 skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+		   const struct flow_keys *keys, int hlen);
+__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+			    void *data, int hlen_proto);
+
+static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
+					int thoff, u8 ip_proto)
+{
+	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
+}
+
+void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
+			     const struct flow_dissector_key *key,
+			     unsigned int key_count);
+
+bool __skb_flow_dissect(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container,
+			void *data, __be16 proto, int nhoff, int hlen);
+
+static inline bool skb_flow_dissect(const struct sk_buff *skb,
+				    struct flow_dissector *flow_dissector,
+				    void *target_container)
+{
+	return __skb_flow_dissect(skb, flow_dissector, target_container,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
+					      struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
+				  NULL, 0, 0, 0);
+}
+
+static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
+						  void *data, __be16 proto,
+						  int nhoff, int hlen)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
+				  data, proto, nhoff, hlen);
+}
+
 static inline __u32 skb_get_hash(struct sk_buff *skb)
 {
 	if (!skb->l4_hash && !skb->sw_hash)
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1a8c22419936..6777a84c6f94 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -2,7 +2,6 @@
 #define _NET_FLOW_DISSECTOR_H
 
 #include <linux/types.h>
-#include <linux/skbuff.h>
 #include <linux/in6.h>
 #include <uapi/linux/if_ether.h>
 
@@ -134,23 +133,6 @@ struct flow_dissector {
 	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
 };
 
-void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
-			     const struct flow_dissector_key *key,
-			     unsigned int key_count);
-
-bool __skb_flow_dissect(const struct sk_buff *skb,
-			struct flow_dissector *flow_dissector,
-			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen);
-
-static inline bool skb_flow_dissect(const struct sk_buff *skb,
-				    struct flow_dissector *flow_dissector,
-				    void *target_container)
-{
-	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0);
-}
-
 struct flow_keys {
 	struct flow_dissector_key_control control;
 #define FLOW_KEYS_HASH_START_FIELD basic
@@ -170,38 +152,6 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow);
 extern struct flow_dissector flow_keys_dissector;
 extern struct flow_dissector flow_keys_buf_dissector;
 
-static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
-					      struct flow_keys *flow)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0);
-}
-
-static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
-						  void *data, __be16 proto,
-						  int nhoff, int hlen)
-{
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
-				  data, proto, nhoff, hlen);
-}
-
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
-			    void *data, int hlen_proto);
-
-static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
-					int thoff, u8 ip_proto)
-{
-	return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0);
-}
-
-u32 flow_hash_from_keys(struct flow_keys *keys);
-void __skb_get_hash(struct sk_buff *skb);
-u32 skb_get_poff(const struct sk_buff *skb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
-		   const struct flow_keys *keys, int hlen);
-
 /* struct flow_keys_digest:
  *
  * This structure is used to hold a digest of the full flow keys. This is a
-- 
cgit v1.2.3


From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:24 -0700
Subject: skbuff: Make __skb_set_sw_hash a general function

Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is
a common method (between __skb_set_sw_hash and skb_set_hash) to set
the hash in an skbuff.

Also, move skb_clear_hash to be closer to __skb_set_hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 45 ++++++++++++++++++++++++++++----------------
 include/net/flow_dissector.h |  5 +++++
 net/core/flow_dissector.c    | 18 ++++++------------
 3 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8a697c673b58..5d2c812e725b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -937,14 +937,40 @@ enum pkt_hash_types {
 	PKT_HASH_TYPE_L4,	/* Input: src_IP, dst_IP, src_port, dst_port */
 };
 
-static inline void
-skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+static inline void skb_clear_hash(struct sk_buff *skb)
 {
-	skb->l4_hash = (type == PKT_HASH_TYPE_L4);
+	skb->hash = 0;
 	skb->sw_hash = 0;
+	skb->l4_hash = 0;
+}
+
+static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
+{
+	if (!skb->l4_hash)
+		skb_clear_hash(skb);
+}
+
+static inline void
+__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
+{
+	skb->l4_hash = is_l4;
+	skb->sw_hash = is_sw;
 	skb->hash = hash;
 }
 
+static inline void
+skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
+{
+	/* Used by drivers to set hash from HW */
+	__skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4);
+}
+
+static inline void
+__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
+{
+	__skb_set_hash(skb, hash, true, is_l4);
+}
+
 void __skb_get_hash(struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
@@ -1027,19 +1053,6 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
 	return skb->hash;
 }
 
-static inline void skb_clear_hash(struct sk_buff *skb)
-{
-	skb->hash = 0;
-	skb->sw_hash = 0;
-	skb->l4_hash = 0;
-}
-
-static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
-{
-	if (!skb->l4_hash)
-		skb_clear_hash(skb);
-}
-
 static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 {
 	to->hash = from->hash;
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 6777a84c6f94..af76c496f7db 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -167,4 +167,9 @@ struct flow_keys_digest {
 void make_flow_keys_digest(struct flow_keys_digest *digest,
 			   const struct flow_keys *flow);
 
+static inline bool flow_keys_have_l4(struct flow_keys *keys)
+{
+	return (keys->ports.ports || keys->tags.flow_label);
+}
+
 #endif
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 11e6540fa386..151b6e48b81f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -590,15 +590,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest,
 }
 EXPORT_SYMBOL(make_flow_keys_digest);
 
-static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash,
-				     struct flow_keys *keys)
-{
-	if (keys->ports.ports)
-		skb->l4_hash = 1;
-	skb->sw_hash = 1;
-	skb->hash = hash;
-}
-
 /**
  * __skb_get_hash: calculate a flow hash
  * @skb: sk_buff to calculate flow hash from
@@ -619,7 +610,8 @@ void __skb_get_hash(struct sk_buff *skb)
 	if (!hash)
 		return;
 
-	__skb_set_sw_hash(skb, hash, &keys);
+	__skb_set_sw_hash(skb, hash,
+			  flow_keys_have_l4(&keys));
 }
 EXPORT_SYMBOL(__skb_get_hash);
 
@@ -648,7 +640,8 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	keys.tags.flow_label = (__force u32)fl6->flowlabel;
 	keys.basic.ip_proto = fl6->flowi6_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
@@ -668,7 +661,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 	keys.keyid.keyid = fl4->fl4_gre_key;
 	keys.basic.ip_proto = fl4->flowi4_proto;
 
-	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys);
+	__skb_set_sw_hash(skb, flow_hash_from_keys(&keys),
+			  flow_keys_have_l4(&keys));
 
 	return skb->hash;
 }
-- 
cgit v1.2.3


From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:25 -0700
Subject: flowi: Abstract out functions to get flow hash based on flowi

Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the
flow keys and hash based on flowi structures. These are called by
__skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created
get_hash_from_flowi6 and get_hash_from_flowi4 which can be called
when just the hash value for a flowi is needed.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h       | 16 ++++++++++++----
 include/net/flow.h           | 19 +++++++++++++++++++
 include/net/flow_dissector.h |  2 ++
 net/core/flow.c              | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5d2c812e725b..bbe41bccfc5f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1030,8 +1030,12 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
 
 static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi6(skb, fl6);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
@@ -1040,8 +1044,12 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
 
 static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 {
-	if (!skb->l4_hash && !skb->sw_hash)
-		__skb_get_hash_flowi4(skb, fl4);
+	if (!skb->l4_hash && !skb->sw_hash) {
+		struct flow_keys keys;
+
+		__skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys),
+				  flow_keys_have_l4(&keys));
+	}
 
 	return skb->hash;
 }
diff --git a/include/net/flow.h b/include/net/flow.h
index 9e0297c4c11d..dafe97c3c048 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -10,6 +10,7 @@
 #include <linux/socket.h>
 #include <linux/in6.h>
 #include <linux/atomic.h>
+#include <net/flow_dissector.h>
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi6(fl6, &keys);
+}
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+
+static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+{
+	struct flow_keys keys;
+
+	return __get_hash_from_flowi4(fl4, &keys);
+}
+
 #endif
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index af76c496f7db..74fe160f0b05 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys)
 	return (keys->ports.ports || keys->tags.flow_label);
 }
 
+u32 flow_hash_from_keys(struct flow_keys *keys);
+
 #endif
diff --git a/net/core/flow.c b/net/core/flow.c
index 1033725be40b..61930bb0eb59 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -22,6 +22,7 @@
 #include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <net/flow.h>
+#include <net/flow_dissector.h>
 #include <linux/atomic.h>
 #include <linux/security.h>
 #include <net/net_namespace.h>
@@ -509,3 +510,38 @@ void flow_cache_fini(struct net *net)
 	fc->percpu = NULL;
 }
 EXPORT_SYMBOL(flow_cache_fini);
+
+__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
+	    sizeof(keys->addrs.v6addrs.src));
+	memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
+	    sizeof(keys->addrs.v6addrs.dst));
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys->ports.src = fl6->fl6_sport;
+	keys->ports.dst = fl6->fl6_dport;
+	keys->keyid.keyid = fl6->fl6_gre_key;
+	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->basic.ip_proto = fl6->flowi6_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi6);
+
+__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+{
+	memset(keys, 0, sizeof(*keys));
+
+	keys->addrs.v4addrs.src = fl4->saddr;
+	keys->addrs.v4addrs.dst = fl4->daddr;
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	keys->ports.src = fl4->fl4_sport;
+	keys->ports.dst = fl4->fl4_dport;
+	keys->keyid.keyid = fl4->fl4_gre_key;
+	keys->basic.ip_proto = fl4->flowi4_proto;
+
+	return flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(__get_hash_from_flowi4);
-- 
cgit v1.2.3


From cd79a2382aa5dcefa6e21a7c59bb1bb19e53b74d Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 09:24:27 -0700
Subject: flow_dissector: Add flags argument to skb_flow_dissector functions

The flags argument will allow control of the dissection process (for
instance whether to parse beyond L3).

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c             |  2 +-
 drivers/net/ethernet/cisco/enic/enic_clsf.c |  2 +-
 drivers/net/hyperv/netvsc_drv.c             |  2 +-
 include/linux/skbuff.h                      | 19 +++++++++++--------
 net/core/flow_dissector.c                   |  7 ++++---
 net/ethernet/eth.c                          |  2 +-
 net/sched/cls_flow.c                        |  2 +-
 net/sched/cls_flower.c                      |  2 +-
 net/sched/sch_choke.c                       |  4 ++--
 9 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 06e2d01f0b4e..771a449d2f56 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3095,7 +3095,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	int noff, proto = -1;
 
 	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
-		return skb_flow_dissect_flow_keys(skb, fk);
+		return skb_flow_dissect_flow_keys(skb, fk, 0);
 
 	fk->ports.ports = 0;
 	noff = skb_network_offset(skb);
diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c
index d106186f4f4a..3c677ed3c29e 100644
--- a/drivers/net/ethernet/cisco/enic/enic_clsf.c
+++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c
@@ -177,7 +177,7 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 	int res, i;
 
 	enic = netdev_priv(dev);
-	res = skb_flow_dissect_flow_keys(skb, &keys);
+	res = skb_flow_dissect_flow_keys(skb, &keys, 0);
 	if (!res || keys.basic.n_proto != htons(ETH_P_IP) ||
 	    (keys.basic.ip_proto != IPPROTO_TCP &&
 	     keys.basic.ip_proto != IPPROTO_UDP))
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 2990024b90f9..409b48e1e589 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -239,7 +239,7 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb)
 	struct flow_keys flow;
 	int data_len;
 
-	if (!skb_flow_dissect_flow_keys(skb, &flow) ||
+	if (!skb_flow_dissect_flow_keys(skb, &flow, 0) ||
 	    !(flow.basic.n_proto == htons(ETH_P_IP) ||
 	      flow.basic.n_proto == htons(ETH_P_IPV6)))
 		return false;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bbe41bccfc5f..9e62687c70f3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -991,31 +991,34 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen);
+			void *data, __be16 proto, int nhoff, int hlen,
+			unsigned int flags);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
-				    void *target_container)
+				    void *target_container, unsigned int flags)
 {
 	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0);
+				  NULL, 0, 0, 0, flags);
 }
 
 static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
-					      struct flow_keys *flow)
+					      struct flow_keys *flow,
+					      unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0);
+				  NULL, 0, 0, 0, flags);
 }
 
 static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow,
 						  void *data, __be16 proto,
-						  int nhoff, int hlen)
+						  int nhoff, int hlen,
+						  unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow,
-				  data, proto, nhoff, hlen);
+				  data, proto, nhoff, hlen, flags);
 }
 
 static inline __u32 skb_get_hash(struct sk_buff *skb)
@@ -2046,7 +2049,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb,
 
 	if (skb_transport_header_was_set(skb))
 		return;
-	else if (skb_flow_dissect_flow_keys(skb, &keys))
+	else if (skb_flow_dissect_flow_keys(skb, &keys, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 	else
 		skb_set_transport_header(skb, offset_hint);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 22f3d768b459..c3d9807cb34e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports);
 bool __skb_flow_dissect(const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
-			void *data, __be16 proto, int nhoff, int hlen)
+			void *data, __be16 proto, int nhoff, int hlen,
+			unsigned int flags)
 {
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
@@ -556,7 +557,7 @@ EXPORT_SYMBOL(flow_hash_from_keys);
 static inline u32 ___skb_get_hash(const struct sk_buff *skb,
 				  struct flow_keys *keys, u32 keyval)
 {
-	if (!skb_flow_dissect_flow_keys(skb, keys))
+	if (!skb_flow_dissect_flow_keys(skb, keys, 0))
 		return 0;
 
 	return __flow_hash_from_keys(keys, keyval);
@@ -726,7 +727,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
-	if (!skb_flow_dissect_flow_keys(skb, &keys))
+	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
 		return 0;
 
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 217127c3a3ef..d850fdc828f9 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -132,7 +132,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
 
 	/* parse any remaining L2/L3 headers, check for L4 */
 	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
-					    sizeof(*eth), len))
+					    sizeof(*eth), len, 0))
 		return max_t(u32, keys.control.thoff, sizeof(*eth));
 
 	/* parse for any L4 headers */
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index bb2a0f529c1f..536838b657bf 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -301,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 
 		keymask = f->keymask;
 		if (keymask & FLOW_KEYS_NEEDED)
-			skb_flow_dissect_flow_keys(skb, &flow_keys);
+			skb_flow_dissect_flow_keys(skb, &flow_keys, 0);
 
 		for (n = 0; n < f->nkeys; n++) {
 			key = ffs(keymask) - 1;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2f3d03f99487..57692947ebbe 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -129,7 +129,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	 * so do it rather here.
 	 */
 	skb_key.basic.n_proto = skb->protocol;
-	skb_flow_dissect(skb, &head->dissector, &skb_key);
+	skb_flow_dissect(skb, &head->dissector, &skb_key, 0);
 
 	fl_set_masked_key(&skb_mkey, &skb_key, &head->mask);
 
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 665bde07916b..02bfd3d1c4f0 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1,
 
 	if (!choke_skb_cb(skb1)->keys_valid) {
 		choke_skb_cb(skb1)->keys_valid = 1;
-		skb_flow_dissect_flow_keys(skb1, &temp);
+		skb_flow_dissect_flow_keys(skb1, &temp, 0);
 		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
 	}
 
 	if (!choke_skb_cb(skb2)->keys_valid) {
 		choke_skb_cb(skb2)->keys_valid = 1;
-		skb_flow_dissect_flow_keys(skb2, &temp);
+		skb_flow_dissect_flow_keys(skb2, &temp, 0);
 		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
 	}
 
-- 
cgit v1.2.3


From de4c1f8ba302ccf4f2b3b17dc614b0a0b14d351a Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 1 Sep 2015 18:11:04 -0700
Subject: flow_dissector: Fix function argument ordering dependency

Commit c6cc1ca7f4d70c ("flowi: Abstract out functions to get flow hash
based on flowi") introduced a bug in __skb_set_sw_hash where we
require a dependency on evaluating arguments in a function in order.
There is no such ordering enforced in C, so this incorrect. This
patch fixes that by splitting out the arguments. This bug was
found via a compiler warning that keys may be uninitialized.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e62687c70f3..eabfb810bc62 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1035,9 +1035,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
+		__u32 hash = __get_hash_from_flowi6(fl6, &keys);
 
-		__skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys),
-				  flow_keys_have_l4(&keys));
+		__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 	}
 
 	return skb->hash;
@@ -1049,9 +1049,9 @@ static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
+		__u32 hash = __get_hash_from_flowi4(fl4, &keys);
 
-		__skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys),
-				  flow_keys_have_l4(&keys));
+		__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
 	}
 
 	return skb->hash;
-- 
cgit v1.2.3


From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 1 Sep 2015 21:19:17 -0700
Subject: flow_dissector: Use 'const' where possible.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    |  8 ++---
 include/net/flow.h        |  8 ++---
 net/core/flow_dissector.c | 79 ++++++++++++++++++++++++-----------------------
 3 files changed, 49 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index eabfb810bc62..2738d355cdf9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1029,9 +1029,9 @@ static inline __u32 skb_get_hash(struct sk_buff *skb)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6);
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6);
 
-static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
@@ -1043,9 +1043,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 	return skb->hash;
 }
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl);
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl);
 
-static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	if (!skb->l4_hash && !skb->sw_hash) {
 		struct flow_keys keys;
diff --git a/include/net/flow.h b/include/net/flow.h
index dafe97c3c048..acd6a096250e 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
 extern atomic_t flow_cache_genid;
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys);
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6)
+static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
 	return __get_hash_from_flowi6(fl6, &keys);
 }
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys);
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys);
 
-static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4)
+static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 345a0408cfe4..d79699c9d1b9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,14 +19,14 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 
-static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector,
-					enum flow_dissector_key_id key_id)
+static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
+			       enum flow_dissector_key_id key_id)
 {
 	return flow_dissector->used_keys & (1 << key_id);
 }
 
-static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector,
-				       enum flow_dissector_key_id key_id)
+static void dissector_set_key(struct flow_dissector *flow_dissector,
+			      enum flow_dissector_key_id key_id)
 {
 	flow_dissector->used_keys |= (1 << key_id);
 }
@@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 		 * boundaries of unsigned short.
 		 */
 		BUG_ON(key->offset > USHRT_MAX);
-		BUG_ON(skb_flow_dissector_uses_key(flow_dissector,
-						   key->key_id));
+		BUG_ON(dissector_uses_key(flow_dissector,
+					  key->key_id));
 
-		skb_flow_dissector_set_key(flow_dissector, key->key_id);
+		dissector_set_key(flow_dissector, key->key_id);
 		flow_dissector->offset[key->key_id] = key->offset;
 	}
 
 	/* Ensure that the dissector always includes control and basic key.
 	 * That way we are able to avoid handling lack of these in fast path.
 	 */
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_CONTROL));
-	BUG_ON(!skb_flow_dissector_uses_key(flow_dissector,
-					    FLOW_DISSECTOR_KEY_BASIC));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_CONTROL));
+	BUG_ON(!dissector_uses_key(flow_dissector,
+				   FLOW_DISSECTOR_KEY_BASIC));
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
@@ -154,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
 		struct flow_dissector_key_eth_addrs *key_eth_addrs;
 
@@ -178,8 +178,8 @@ ip:
 
 		ip_proto = iph->protocol;
 
-		if (!skb_flow_dissector_uses_key(flow_dissector,
-						 FLOW_DISSECTOR_KEY_IPV4_ADDRS))
+		if (!dissector_uses_key(flow_dissector,
+					FLOW_DISSECTOR_KEY_IPV4_ADDRS))
 			break;
 
 		key_addrs = skb_flow_dissector_target(flow_dissector,
@@ -218,8 +218,8 @@ ipv6:
 		ip_proto = iph->nexthdr;
 		nhoff += sizeof(struct ipv6hdr);
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
 			struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
 
 			key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
@@ -232,8 +232,8 @@ ipv6:
 
 		flow_label = ip6_flowlabel(iph);
 		if (flow_label) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-				FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
 				key_tags = skb_flow_dissector_target(flow_dissector,
 								     FLOW_DISSECTOR_KEY_FLOW_LABEL,
 								     target_container);
@@ -257,8 +257,8 @@ ipv6:
 		if (!vlan)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_VLANID)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_VLANID)) {
 			key_tags = skb_flow_dissector_target(flow_dissector,
 							     FLOW_DISSECTOR_KEY_VLANID,
 							     target_container);
@@ -298,8 +298,8 @@ ipv6:
 		if (!hdr)
 			goto out_bad;
 
-		if (skb_flow_dissector_uses_key(flow_dissector,
-						FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
+		if (dissector_uses_key(flow_dissector,
+				       FLOW_DISSECTOR_KEY_TIPC_ADDRS)) {
 			key_addrs = skb_flow_dissector_target(flow_dissector,
 							      FLOW_DISSECTOR_KEY_TIPC_ADDRS,
 							      target_container);
@@ -320,8 +320,8 @@ mpls:
 
 		if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >>
 		     MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) {
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
 								      target_container);
@@ -374,8 +374,8 @@ ip_proto_again:
 			if (!keyid)
 				goto out_bad;
 
-			if (skb_flow_dissector_uses_key(flow_dissector,
-							FLOW_DISSECTOR_KEY_GRE_KEYID)) {
+			if (dissector_uses_key(flow_dissector,
+					       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
 				key_keyid = skb_flow_dissector_target(flow_dissector,
 								      FLOW_DISSECTOR_KEY_GRE_KEYID,
 								      target_container);
@@ -470,8 +470,8 @@ ip_proto_again:
 		break;
 	}
 
-	if (skb_flow_dissector_uses_key(flow_dissector,
-					FLOW_DISSECTOR_KEY_PORTS)) {
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_PORTS)) {
 		key_ports = skb_flow_dissector_target(flow_dissector,
 						      FLOW_DISSECTOR_KEY_PORTS,
 						      target_container);
@@ -497,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
+static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
+					     u32 keyval)
 {
 	return jhash2(words, length, keyval);
 }
 
-static inline void *flow_keys_hash_start(struct flow_keys *flow)
+static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
 {
+	const void *p = flow;
+
 	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
-	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+	return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
 }
 
-static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
 {
 	size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
 	BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
@@ -598,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 
 	__flow_hash_consistentify(keys);
 
-	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+	hash = __flow_hash_words(flow_keys_hash_start(keys),
 				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;
@@ -677,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
 }
 EXPORT_SYMBOL(skb_get_hash_perturb);
 
-__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
+__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
 {
 	struct flow_keys keys;
 
@@ -701,7 +704,7 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6)
 }
 EXPORT_SYMBOL(__skb_get_hash_flowi6);
 
-__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4)
+__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4)
 {
 	struct flow_keys keys;
 
@@ -787,7 +790,7 @@ u32 skb_get_poff(const struct sk_buff *skb)
 	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
 }
 
-__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
+__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
@@ -806,7 +809,7 @@ __u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys)
 }
 EXPORT_SYMBOL(__get_hash_from_flowi6);
 
-__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys)
+__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
 {
 	memset(keys, 0, sizeof(*keys));
 
-- 
cgit v1.2.3


From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 3 Sep 2015 01:26:07 +0200
Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in

Fengguang reported, that some randconfig generated the following linker
issue with nf_ct_zone_dflt object involved:

  [...]
  CC      init/version.o
  LD      init/built-in.o
  net/built-in.o: In function `ipv4_conntrack_defrag':
  nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt'
  net/built-in.o: In function `ipv6_defrag':
  nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt'
  make: *** [vmlinux] Error 1

Given that configurations exist where we have a built-in part, which is
accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user()
and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a
module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in
area when netfilter is configured in general.

Therefore, split the more generic parts into a common header under
include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in
section that already holds parts related to CONFIG_NF_CONNTRACK in the
netfilter core. This fixes the issue on my side.

Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter.h                          |  2 ++
 .../linux/netfilter/nf_conntrack_zones_common.h    | 23 ++++++++++++++++++++++
 include/net/netfilter/nf_conntrack_zones.h         | 19 +-----------------
 net/netfilter/core.c                               |  6 ++++++
 net/netfilter/nf_conntrack_core.c                  |  7 -------
 5 files changed, 32 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_zones_common.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index d788ce62d826..36a652531791 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -368,6 +368,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 #endif /*CONFIG_NETFILTER*/
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <linux/netfilter/nf_conntrack_zones_common.h>
+
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
 extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h
new file mode 100644
index 000000000000..5d7cf36d4766
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_zones_common.h
@@ -0,0 +1,23 @@
+#ifndef _NF_CONNTRACK_ZONES_COMMON_H
+#define _NF_CONNTRACK_ZONES_COMMON_H
+
+#include <uapi/linux/netfilter/nf_conntrack_tuple_common.h>
+
+#define NF_CT_DEFAULT_ZONE_ID	0
+
+#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
+#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
+
+#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
+
+#define NF_CT_FLAG_MARK		1
+
+struct nf_conntrack_zone {
+	u16	id;
+	u8	flags;
+	u8	dir;
+};
+
+extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+
+#endif /* _NF_CONNTRACK_ZONES_COMMON_H */
diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h
index 5316c7b3a374..4e32512cef32 100644
--- a/include/net/netfilter/nf_conntrack_zones.h
+++ b/include/net/netfilter/nf_conntrack_zones.h
@@ -1,24 +1,7 @@
 #ifndef _NF_CONNTRACK_ZONES_H
 #define _NF_CONNTRACK_ZONES_H
 
-#include <linux/netfilter/nf_conntrack_tuple_common.h>
-
-#define NF_CT_DEFAULT_ZONE_ID	0
-
-#define NF_CT_ZONE_DIR_ORIG	(1 << IP_CT_DIR_ORIGINAL)
-#define NF_CT_ZONE_DIR_REPL	(1 << IP_CT_DIR_REPLY)
-
-#define NF_CT_DEFAULT_ZONE_DIR	(NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
-
-#define NF_CT_FLAG_MARK		1
-
-struct nf_conntrack_zone {
-	u16	id;
-	u8	flags;
-	u8	dir;
-};
-
-extern const struct nf_conntrack_zone nf_ct_zone_dflt;
+#include <linux/netfilter/nf_conntrack_zones_common.h>
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack_extend.h>
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0b939b7ad724..8e47f8113495 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -388,6 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
 struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly;
 EXPORT_SYMBOL_GPL(nfq_ct_hook);
 
+/* Built-in default zone used e.g. by modules. */
+const struct nf_conntrack_zone nf_ct_zone_dflt = {
+	.id	= NF_CT_DEFAULT_ZONE_ID,
+	.dir	= NF_CT_DEFAULT_ZONE_DIR,
+};
+EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
 #endif /* CONFIG_NF_CONNTRACK */
 
 #ifdef CONFIG_NF_NAT_NEEDED
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ac3be9b0629b..eedf0495f11f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1286,13 +1286,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct,
 }
 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
 
-/* Built-in default zone used e.g. by modules. */
-const struct nf_conntrack_zone nf_ct_zone_dflt = {
-	.id	= NF_CT_DEFAULT_ZONE_ID,
-	.dir	= NF_CT_DEFAULT_ZONE_DIR,
-};
-EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
-
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = {
 	.len	= sizeof(struct nf_conntrack_zone),
-- 
cgit v1.2.3