From d1174416747d790d750742d0514915deeed93acf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 10 May 2017 11:22:52 -0700
Subject: bpf: Track alignment of register values in the verifier.

Currently if we add only constant values to pointers we can fully
validate the alignment, and properly check if we need to reject the
program on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS architectures.

However, once an unknown value is introduced we only allow byte sized
memory accesses which is too restrictive.

Add logic to track the known minimum alignment of register values,
and propagate this state into registers containing pointers.

The most common paradigm that makes use of this new logic is computing
the transport header using the IP header length field.  For example:

	struct ethhdr *ep = skb->data;
	struct iphdr *iph = (struct iphdr *) (ep + 1);
	struct tcphdr *th;
 ...
	n = iph->ihl;
	th = ((void *)iph + (n * 4));
	port = th->dest;

The existing code will reject the load of th->dest because it cannot
validate that the alignment is at least 2 once "n * 4" is added the
the packet pointer.

In the new code, the register holding "n * 4" will have a reg->min_align
value of 4, because any value multiplied by 4 will be at least 4 byte
aligned.  (actually, the eBPF code emitted by the compiler in this case
is most likely to use a shift left by 2, but the end result is identical)

At the critical addition:

	th = ((void *)iph + (n * 4));

The register holding 'th' will start with reg->off value of 14.  The
pointer addition will transform that reg into something that looks like:

	reg->aux_off = 14
	reg->aux_off_align = 4

Next, the verifier will look at the th->dest load, and it will see
a load offset of 2, and first check:

	if (reg->aux_off_align % size)

which will pass because aux_off_align is 4.  reg_off will be computed:

	reg_off = reg->off;
 ...
		reg_off += reg->aux_off;

plus we have off==2, and it will thus check:

	if ((NET_IP_ALIGN + reg_off + off) % size != 0)

which evaluates to:

	if ((NET_IP_ALIGN + 14 + 2) % size != 0)

On strict alignment architectures, NET_IP_ALIGN is 2, thus:

	if ((2 + 14 + 2) % size != 0)

which passes.

These pointer transformations and checks work regardless of whether
the constant offset or the variable with known alignment is added
first to the pointer register.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5efb4db44e1e..7c6a51924afc 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -40,6 +40,9 @@ struct bpf_reg_state {
 	 */
 	s64 min_value;
 	u64 max_value;
+	u32 min_align;
+	u32 aux_off;
+	u32 aux_off_align;
 };
 
 enum bpf_stack_slot_type {
-- 
cgit v1.2.3


From e07b98d9bffe410019dfcf62c3428d4a96c56a2c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 10 May 2017 11:38:07 -0700
Subject: bpf: Add strict alignment flag for BPF_PROG_LOAD.

Add a new field, "prog_flags", and an initial flag value
BPF_F_STRICT_ALIGNMENT.

When set, the verifier will enforce strict pointer alignment
regardless of the setting of CONFIG_EFFICIENT_UNALIGNED_ACCESS.

The verifier, in this mode, will also use a fixed value of "2" in
place of NET_IP_ALIGN.

This facilitates test cases that will exercise and validate this part
of the verifier even when run on architectures where alignment doesn't
matter.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h   |  1 +
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/bpf/syscall.c           |  5 ++++-
 kernel/bpf/verifier.c          | 23 +++++++++++++++++------
 tools/build/feature/test-bpf.c |  1 +
 tools/include/uapi/linux/bpf.h | 11 +++++++++--
 6 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7c6a51924afc..d5093b52b485 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -90,6 +90,7 @@ struct bpf_verifier_env {
 	struct bpf_prog *prog;		/* eBPF program being verified */
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
+	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 945a1f5f63c5..94dfa9def355 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,13 @@ enum bpf_attach_type {
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
 
+/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will perform strict alignment checking as if the kernel
+ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
+ * and NET_IP_ALIGN defined to 2.
+ */
+#define BPF_F_STRICT_ALIGNMENT	(1U << 0)
+
 #define BPF_PSEUDO_MAP_FD	1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -177,6 +184,7 @@ union bpf_attr {
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		prog_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fd2411fd6914..265a0d854e33 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD kern_version
+#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_LOAD))
 		return -EINVAL;
 
+	if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
+		return -EINVAL;
+
 	/* copy eBPF program license from user space */
 	if (strncpy_from_user(license, u64_to_user_ptr(attr->license),
 			      sizeof(license) - 1) < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ff2bfe1d656a..e74fb1b87855 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -791,6 +791,7 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
+	int ip_align;
 	int reg_off;
 
 	/* Byte size accesses are always allowed. */
@@ -807,10 +808,14 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 		reg_off += reg->aux_off;
 	}
 
-	/* skb->data is NET_IP_ALIGN-ed */
-	if ((NET_IP_ALIGN + reg_off + off) % size != 0) {
+	/* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking
+	 * we force this to 2 which is universally what architectures use
+	 * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+	 */
+	ip_align = strict ? 2 : NET_IP_ALIGN;
+	if ((ip_align + reg_off + off) % size != 0) {
 		verbose("misaligned packet access off %d+%d+%d size %d\n",
-			NET_IP_ALIGN, reg_off, off, size);
+			ip_align, reg_off, off, size);
 		return -EACCES;
 	}
 
@@ -828,10 +833,11 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
 	return 0;
 }
 
-static int check_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+			       const struct bpf_reg_state *reg,
 			       int off, int size)
 {
-	bool strict = false;
+	bool strict = env->strict_alignment;
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		strict = true;
@@ -873,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	err = check_ptr_alignment(reg, off, size);
+	err = check_ptr_alignment(env, reg, off, size);
 	if (err)
 		return err;
 
@@ -3568,6 +3574,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	} else {
 		log_level = 0;
 	}
+	if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT)
+		env->strict_alignment = true;
+	else
+		env->strict_alignment = false;
 
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
@@ -3673,6 +3683,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	mutex_lock(&bpf_verifier_lock);
 
 	log_level = 0;
+	env->strict_alignment = false;
 
 	env->explored_states = kcalloc(env->prog->len,
 				       sizeof(struct bpf_verifier_state_list *),
diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c
index ebc6dceddb58..7598361ef1f1 100644
--- a/tools/build/feature/test-bpf.c
+++ b/tools/build/feature/test-bpf.c
@@ -29,6 +29,7 @@ int main(void)
 	attr.log_size = 0;
 	attr.log_level = 0;
 	attr.kern_version = 0;
+	attr.prog_flags = 0;
 
 	/*
 	 * Test existence of __NR_bpf and BPF_PROG_LOAD.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e553529929f6..94dfa9def355 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -132,6 +132,13 @@ enum bpf_attach_type {
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
 
+/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will perform strict alignment checking as if the kernel
+ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
+ * and NET_IP_ALIGN defined to 2.
+ */
+#define BPF_F_STRICT_ALIGNMENT	(1U << 0)
+
 #define BPF_PSEUDO_MAP_FD	1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -177,6 +184,7 @@ union bpf_attr {
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		prog_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -481,8 +489,7 @@ union bpf_attr {
  * u32 bpf_get_socket_uid(skb)
  *     Get the owner uid of the socket stored inside sk_buff.
  *     @skb: pointer to skb
- *     Return: uid of the socket owner on success or 0 if the socket pointer
- *     inside sk_buff is NULL
+ *     Return: uid of the socket owner on success or overflowuid if failed.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
-- 
cgit v1.2.3


From d67b9cd28c1d7f82c2e5e727731ea7c89b23a0a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 12 May 2017 01:04:46 +0200
Subject: xdp: refine xdp api with regards to generic xdp

While working on the iproute2 generic XDP frontend, I noticed that
as of right now it's possible to have native *and* generic XDP
programs loaded both at the same time for the case when a driver
supports native XDP.

The intended model for generic XDP from b5cdae3291f7 ("net: Generic
XDP") is, however, that only one out of the two can be present at
once which is also indicated as such in the XDP netlink dump part.
The main rationale for generic XDP is to ease accessibility (in
case a driver does not yet have XDP support) and to generically
provide a semantical model as an example for driver developers
wanting to add XDP support. The generic XDP option for an XDP
aware driver can still be useful for comparing and testing both
implementations.

However, it is not intended to have a second XDP processing stage
or layer with exactly the same functionality of the first native
stage. Only reason could be to have a partial fallback for future
XDP features that are not supported yet in the native implementation
and we probably also shouldn't strive for such fallback and instead
encourage native feature support in the first place. Given there's
currently no such fallback issue or use case, lets not go there yet
if we don't need to.

Therefore, change semantics for loading XDP and bail out if the
user tries to load a generic XDP program when a native one is
present and vice versa. Another alternative to bailing out would
be to handle the transition from one flavor to another gracefully,
but that would require to bring the device down, exchange both
types of programs, and bring it up again in order to avoid a tiny
window where a packet could hit both hooks. Given this complicates
the logic for just a debugging feature in the native case, I went
with the simpler variant.

For the dump, remove IFLA_XDP_FLAGS that was added with b5cdae3291f7
and reuse IFLA_XDP_ATTACHED for indicating the mode. Dumping all
or just a subset of flags that were used for loading the XDP prog
is suboptimal in the long run since not all flags are useful for
dumping and if we start to reuse the same flag definitions for
load and dump, then we'll waste bit space. What we really just
want is to dump the mode for now.

Current IFLA_XDP_ATTACHED semantics are: nothing was installed (0),
a program is running at the native driver layer (1). Thus, add a
mode that says that a program is running at generic XDP layer (2).
Applications will handle this fine in that older binaries will
just indicate that something is attached at XDP layer, effectively
this is similar to IFLA_XDP_FLAGS attr that we would have had
modulo the redundancy.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  8 +++++--
 include/uapi/linux/if_link.h |  7 ++++++
 net/core/dev.c               | 55 +++++++++++++++++++++++++++++---------------
 net/core/rtnetlink.c         | 40 +++++++++++++++-----------------
 4 files changed, 67 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c23bd2efb56..3f39d27decf4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3296,11 +3296,15 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
-		      int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
+
+typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+		      int fd, u32 flags);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op);
+
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 549ac8a98b44..15ac20382aba 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -894,6 +894,13 @@ enum {
 					 XDP_FLAGS_SKB_MODE | \
 					 XDP_FLAGS_DRV_MODE)
 
+/* These are stored into IFLA_XDP_ATTACHED on dump. */
+enum {
+	XDP_ATTACHED_NONE = 0,
+	XDP_ATTACHED_DRV,
+	XDP_ATTACHED_SKB,
+};
+
 enum {
 	IFLA_XDP_UNSPEC,
 	IFLA_XDP_FD,
diff --git a/net/core/dev.c b/net/core/dev.c
index e56cb71351d4..fca407b4a6ea 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6852,6 +6852,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+
+	/* Query must always succeed. */
+	WARN_ON(xdp_op(dev, &xdp) < 0);
+	return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+			   struct netlink_ext_ack *extack,
+			   struct bpf_prog *prog)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_SETUP_PROG;
+	xdp.extack = extack;
+	xdp.prog = prog;
+
+	return xdp_op(dev, &xdp);
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -6864,43 +6890,34 @@ EXPORT_SYMBOL(dev_change_proto_down);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
-	int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct bpf_prog *prog = NULL;
-	struct netdev_xdp xdp;
+	xdp_op_t xdp_op, xdp_chk;
 	int err;
 
 	ASSERT_RTNL();
 
-	xdp_op = ops->ndo_xdp;
+	xdp_op = xdp_chk = ops->ndo_xdp;
 	if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
 		return -EOPNOTSUPP;
 	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
 		xdp_op = generic_xdp_install;
+	if (xdp_op == xdp_chk)
+		xdp_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
-			memset(&xdp, 0, sizeof(xdp));
-			xdp.command = XDP_QUERY_PROG;
-
-			err = xdp_op(dev, &xdp);
-			if (err < 0)
-				return err;
-			if (xdp.prog_attached)
-				return -EBUSY;
-		}
+		if (xdp_chk && __dev_xdp_attached(dev, xdp_chk))
+			return -EEXIST;
+		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+		    __dev_xdp_attached(dev, xdp_op))
+			return -EBUSY;
 
 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
 
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_SETUP_PROG;
-	xdp.extack = extack;
-	xdp.prog = prog;
-
-	err = xdp_op(dev, &xdp);
+	err = dev_xdp_install(dev, xdp_op, extack, prog);
 	if (err < 0 && prog)
 		bpf_prog_put(prog);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dda9f1636356..d7f82c3450b1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev,
 static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
-			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4);	/* XDP_FLAGS */
+			  nla_total_size(1);	/* XDP_ATTACHED */
 
 	return xdp_size;
 }
@@ -1247,37 +1246,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static u8 rtnl_xdp_attached_mode(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	ASSERT_RTNL();
+
+	if (rcu_access_pointer(dev->xdp_prog))
+		return XDP_ATTACHED_SKB;
+	if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp))
+		return XDP_ATTACHED_DRV;
+
+	return XDP_ATTACHED_NONE;
+}
+
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	struct nlattr *xdp;
-	u32 xdp_flags = 0;
-	u8 val = 0;
 	int err;
 
 	xdp = nla_nest_start(skb, IFLA_XDP);
 	if (!xdp)
 		return -EMSGSIZE;
-	if (rcu_access_pointer(dev->xdp_prog)) {
-		xdp_flags = XDP_FLAGS_SKB_MODE;
-		val = 1;
-	} else if (dev->netdev_ops->ndo_xdp) {
-		struct netdev_xdp xdp_op = {};
-
-		xdp_op.command = XDP_QUERY_PROG;
-		err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
-		if (err)
-			goto err_cancel;
-		val = xdp_op.prog_attached;
-	}
-	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
+
+	err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
+			 rtnl_xdp_attached_mode(dev));
 	if (err)
 		goto err_cancel;
 
-	if (xdp_flags) {
-		err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags);
-		if (err)
-			goto err_cancel;
-	}
 	nla_nest_end(skb, xdp);
 	return 0;
 
-- 
cgit v1.2.3


From 508541146af18e43072e41a31aa62fac2b01aac1 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 25 Apr 2017 10:39:57 +0300
Subject: net/mlx5: Use underlay QPN from the root name space

Root flow table is dynamically changed by the underlying flow steering
layer, and IPoIB/ULPs have no idea what will be the root flow table in
the future, hence we need a dynamic infrastructure to move Underlay QPs
with the root flow table.

Fixes: b3ba51498bdd ("net/mlx5: Refactor create flow table method to accept underlay QP")
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c   |  5 ++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  |  9 +++-----
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h  |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 25 +++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/ipoib.c   |  7 +++++--
 include/linux/mlx5/fs.h                           |  4 +++-
 8 files changed, 38 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 0099a3e397bc..2fd044b23875 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1003,7 +1003,7 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv);
 void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv);
 void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
 
-int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn);
+int mlx5e_create_ttc_table(struct mlx5e_priv *priv);
 void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv);
 
 int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 576d6787b484..53ed58320a24 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -800,7 +800,7 @@ void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv)
 	mlx5e_destroy_flow_table(&ttc->ft);
 }
 
-int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn)
+int mlx5e_create_ttc_table(struct mlx5e_priv *priv)
 {
 	struct mlx5e_ttc_table *ttc = &priv->fs.ttc;
 	struct mlx5_flow_table_attr ft_attr = {};
@@ -810,7 +810,6 @@ int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn)
 	ft_attr.max_fte = MLX5E_TTC_TABLE_SIZE;
 	ft_attr.level = MLX5E_TTC_FT_LEVEL;
 	ft_attr.prio = MLX5E_NIC_PRIO;
-	ft_attr.underlay_qpn = underlay_qpn;
 
 	ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr);
 	if (IS_ERR(ft->t)) {
@@ -1147,7 +1146,7 @@ int mlx5e_create_flow_steering(struct mlx5e_priv *priv)
 		priv->netdev->hw_features &= ~NETIF_F_NTUPLE;
 	}
 
-	err = mlx5e_create_ttc_table(priv, 0);
+	err = mlx5e_create_ttc_table(priv);
 	if (err) {
 		netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n",
 			   err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 19e3d2fc2099..fcec7bedd3cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -40,28 +40,25 @@
 #include "eswitch.h"
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
-			    struct mlx5_flow_table *ft)
+			    struct mlx5_flow_table *ft, u32 underlay_qpn)
 {
 	u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)]   = {0};
 	u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0};
 
 	if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
-	    ft->underlay_qpn == 0)
+	    underlay_qpn == 0)
 		return 0;
 
 	MLX5_SET(set_flow_table_root_in, in, opcode,
 		 MLX5_CMD_OP_SET_FLOW_TABLE_ROOT);
 	MLX5_SET(set_flow_table_root_in, in, table_type, ft->type);
 	MLX5_SET(set_flow_table_root_in, in, table_id, ft->id);
+	MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn);
 	if (ft->vport) {
 		MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport);
 		MLX5_SET(set_flow_table_root_in, in, other_vport, 1);
 	}
 
-	if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
-	    ft->underlay_qpn != 0)
-		MLX5_SET(set_flow_table_root_in, in, underlay_qpn, ft->underlay_qpn);
-
 	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 8fad80688536..0f98a7cf4877 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -71,7 +71,8 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev,
 			unsigned int index);
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
-			    struct mlx5_flow_table *ft);
+			    struct mlx5_flow_table *ft,
+			    u32 underlay_qpn);
 
 int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id);
 int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index b8a176503d38..0e487e8ca634 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -650,7 +650,7 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
 	if (ft->level >= min_level)
 		return 0;
 
-	err = mlx5_cmd_update_root_ft(root->dev, ft);
+	err = mlx5_cmd_update_root_ft(root->dev, ft, root->underlay_qpn);
 	if (err)
 		mlx5_core_warn(root->dev, "Update root flow table of id=%u failed\n",
 			       ft->id);
@@ -818,8 +818,6 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 		goto unlock_root;
 	}
 
-	ft->underlay_qpn = ft_attr->underlay_qpn;
-
 	tree_init_node(&ft->node, 1, del_flow_table);
 	log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
 	next_ft = find_next_chained_ft(fs_prio);
@@ -1489,7 +1487,8 @@ static int update_root_ft_destroy(struct mlx5_flow_table *ft)
 
 	new_root_ft = find_next_ft(ft);
 	if (new_root_ft) {
-		int err = mlx5_cmd_update_root_ft(root->dev, new_root_ft);
+		int err = mlx5_cmd_update_root_ft(root->dev, new_root_ft,
+						  root->underlay_qpn);
 
 		if (err) {
 			mlx5_core_warn(root->dev, "Update root flow table of id=%u failed\n",
@@ -2062,3 +2061,21 @@ err:
 	mlx5_cleanup_fs(dev);
 	return err;
 }
+
+int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn)
+{
+	struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns;
+
+	root->underlay_qpn = underlay_qpn;
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_fs_add_rx_underlay_qpn);
+
+int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn)
+{
+	struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns;
+
+	root->underlay_qpn = 0;
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_fs_remove_rx_underlay_qpn);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 81eafc7b9dd9..990acee6fb09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -118,7 +118,6 @@ struct mlx5_flow_table {
 	/* FWD rules that point on this flow table */
 	struct list_head		fwd_rules;
 	u32				flags;
-	u32				underlay_qpn;
 };
 
 struct mlx5_fc_cache {
@@ -195,6 +194,7 @@ struct mlx5_flow_root_namespace {
 	struct mlx5_flow_table		*root_ft;
 	/* Should be held when chaining flow tables */
 	struct mutex			chain_lock;
+	u32				underlay_qpn;
 };
 
 int mlx5_init_fc_stats(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c
index 56bff3540954..cc1858752e70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c
@@ -160,6 +160,8 @@ out:
 
 static void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp)
 {
+	mlx5_fs_remove_rx_underlay_qpn(mdev, qp->qpn);
+
 	mlx5_core_destroy_qp(mdev, qp);
 }
 
@@ -174,6 +176,8 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv)
 		return err;
 	}
 
+	mlx5_fs_add_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn);
+
 	err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, &priv->tisn[0]);
 	if (err) {
 		mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err);
@@ -193,7 +197,6 @@ static void mlx5i_cleanup_tx(struct mlx5e_priv *priv)
 
 static int mlx5i_create_flow_steering(struct mlx5e_priv *priv)
 {
-	struct mlx5i_priv *ipriv = priv->ppriv;
 	int err;
 
 	priv->fs.ns = mlx5_get_flow_namespace(priv->mdev,
@@ -209,7 +212,7 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv *priv)
 		priv->netdev->hw_features &= ~NETIF_F_NTUPLE;
 	}
 
-	err = mlx5e_create_ttc_table(priv, ipriv->qp.qpn);
+	err = mlx5e_create_ttc_table(priv);
 	if (err) {
 		netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n",
 			   err);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 1b166d2e19c5..b25e7baa273e 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -109,7 +109,6 @@ struct mlx5_flow_table_attr {
 	int max_fte;
 	u32 level;
 	u32 flags;
-	u32 underlay_qpn;
 };
 
 struct mlx5_flow_table *
@@ -167,4 +166,7 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
 			  u64 *bytes, u64 *packets, u64 *lastuse);
+int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
+int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
+
 #endif
-- 
cgit v1.2.3