From 61e84623ace35ce48975e8f90bbbac7557c43d61 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 7 Oct 2016 22:04:33 -0400
Subject: net: centralize net_device min/max MTU checking

While looking into an MTU issue with sfc, I started noticing that almost
every NIC driver with an ndo_change_mtu function implemented almost
exactly the same range checks, and in many cases, that was the only
practical thing their ndo_change_mtu function was doing. Quite a few
drivers have either 68, 64, 60 or 46 as their minimum MTU value checked,
and then various sizes from 1500 to 65535 for their maximum MTU value. We
can remove a whole lot of redundant code here if we simply store min_mtu
and max_mtu in net_device, and check against those in net/core/dev.c's
dev_set_mtu().

In theory, there should be zero functional change with this patch, it just
puts the infrastructure in place. Subsequent patches will attempt to start
using said infrastructure, with theoretically zero change in
functionality.

CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 136ae6bbe81e..fbdf923af4d3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1506,6 +1506,8 @@ enum netdev_priv_flags {
  *	@if_port:	Selectable AUI, TP, ...
  *	@dma:		DMA channel
  *	@mtu:		Interface MTU value
+ *	@min_mtu:	Interface Minimum MTU value
+ *	@max_mtu:	Interface Maximum MTU value
  *	@type:		Interface hardware type
  *	@hard_header_len: Maximum hardware header length.
  *
@@ -1726,6 +1728,8 @@ struct net_device {
 	unsigned char		dma;
 
 	unsigned int		mtu;
+	unsigned int		min_mtu;
+	unsigned int		max_mtu;
 	unsigned short		type;
 	unsigned short		hard_header_len;
 
-- 
cgit v1.2.3


From cf53b1da73bdf940f1523ec5a7d375d7056c759c Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Tue, 11 Oct 2016 13:04:09 -0700
Subject: Revert "net: Add driver helper functions to determine checksum
 offloadability"

This reverts commit 6ae23ad36253a8033c5714c52b691b84456487c5.

The code has been in the kernel since 4.4 but there is no in-tree
code that uses it. Unused code is broken code, so remove it.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  78 --------------------------
 net/core/dev.c            | 136 ----------------------------------------------
 2 files changed, 214 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fbdf923af4d3..bf341b65ca5e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2653,71 +2653,6 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
-struct skb_csum_offl_spec {
-	__u16		ipv4_okay:1,
-			ipv6_okay:1,
-			encap_okay:1,
-			ip_options_okay:1,
-			ext_hdrs_okay:1,
-			tcp_okay:1,
-			udp_okay:1,
-			sctp_okay:1,
-			vlan_okay:1,
-			no_encapped_ipv6:1,
-			no_not_encapped:1;
-};
-
-bool __skb_csum_offload_chk(struct sk_buff *skb,
-			    const struct skb_csum_offl_spec *spec,
-			    bool *csum_encapped,
-			    bool csum_help);
-
-static inline bool skb_csum_offload_chk(struct sk_buff *skb,
-					const struct skb_csum_offl_spec *spec,
-					bool *csum_encapped,
-					bool csum_help)
-{
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
-		return false;
-
-	return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help);
-}
-
-static inline bool skb_csum_offload_chk_help(struct sk_buff *skb,
-					     const struct skb_csum_offl_spec *spec)
-{
-	bool csum_encapped;
-
-	return skb_csum_offload_chk(skb, spec, &csum_encapped, true);
-}
-
-static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb)
-{
-	static const struct skb_csum_offl_spec csum_offl_spec = {
-		.ipv4_okay = 1,
-		.ip_options_okay = 1,
-		.ipv6_okay = 1,
-		.vlan_okay = 1,
-		.tcp_okay = 1,
-		.udp_okay = 1,
-	};
-
-	return skb_csum_offload_chk_help(skb, &csum_offl_spec);
-}
-
-static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb)
-{
-	static const struct skb_csum_offl_spec csum_offl_spec = {
-		.ipv4_okay = 1,
-		.ip_options_okay = 1,
-		.tcp_okay = 1,
-		.udp_okay = 1,
-		.vlan_okay = 1,
-	};
-
-	return skb_csum_offload_chk_help(skb, &csum_offl_spec);
-}
-
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
@@ -3961,19 +3896,6 @@ static inline bool can_checksum_protocol(netdev_features_t features,
 	}
 }
 
-/* Map an ethertype into IP protocol if possible */
-static inline int eproto_to_ipproto(int eproto)
-{
-	switch (eproto) {
-	case htons(ETH_P_IP):
-		return IPPROTO_IP;
-	case htons(ETH_P_IPV6):
-		return IPPROTO_IPV6;
-	default:
-		return -1;
-	}
-}
-
 #ifdef CONFIG_BUG
 void netdev_rx_csum_fault(struct net_device *dev);
 #else
diff --git a/net/core/dev.c b/net/core/dev.c
index f376639e8774..6498cc2ba8f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -139,7 +139,6 @@
 #include <linux/errqueue.h>
 #include <linux/hrtimer.h>
 #include <linux/netfilter_ingress.h>
-#include <linux/sctp.h>
 #include <linux/crash_dump.h>
 
 #include "net-sysfs.h"
@@ -2492,141 +2491,6 @@ out:
 }
 EXPORT_SYMBOL(skb_checksum_help);
 
-/* skb_csum_offload_check - Driver helper function to determine if a device
- * with limited checksum offload capabilities is able to offload the checksum
- * for a given packet.
- *
- * Arguments:
- *   skb - sk_buff for the packet in question
- *   spec - contains the description of what device can offload
- *   csum_encapped - returns true if the checksum being offloaded is
- *	      encpasulated. That is it is checksum for the transport header
- *	      in the inner headers.
- *   checksum_help - when set indicates that helper function should
- *	      call skb_checksum_help if offload checks fail
- *
- * Returns:
- *   true: Packet has passed the checksum checks and should be offloadable to
- *	   the device (a driver may still need to check for additional
- *	   restrictions of its device)
- *   false: Checksum is not offloadable. If checksum_help was set then
- *	   skb_checksum_help was called to resolve checksum for non-GSO
- *	   packets and when IP protocol is not SCTP
- */
-bool __skb_csum_offload_chk(struct sk_buff *skb,
-			    const struct skb_csum_offl_spec *spec,
-			    bool *csum_encapped,
-			    bool csum_help)
-{
-	struct iphdr *iph;
-	struct ipv6hdr *ipv6;
-	void *nhdr;
-	int protocol;
-	u8 ip_proto;
-
-	if (skb->protocol == htons(ETH_P_8021Q) ||
-	    skb->protocol == htons(ETH_P_8021AD)) {
-		if (!spec->vlan_okay)
-			goto need_help;
-	}
-
-	/* We check whether the checksum refers to a transport layer checksum in
-	 * the outermost header or an encapsulated transport layer checksum that
-	 * corresponds to the inner headers of the skb. If the checksum is for
-	 * something else in the packet we need help.
-	 */
-	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
-		/* Non-encapsulated checksum */
-		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
-		nhdr = skb_network_header(skb);
-		*csum_encapped = false;
-		if (spec->no_not_encapped)
-			goto need_help;
-	} else if (skb->encapsulation && spec->encap_okay &&
-		   skb_checksum_start_offset(skb) ==
-		   skb_inner_transport_offset(skb)) {
-		/* Encapsulated checksum */
-		*csum_encapped = true;
-		switch (skb->inner_protocol_type) {
-		case ENCAP_TYPE_ETHER:
-			protocol = eproto_to_ipproto(skb->inner_protocol);
-			break;
-		case ENCAP_TYPE_IPPROTO:
-			protocol = skb->inner_protocol;
-			break;
-		}
-		nhdr = skb_inner_network_header(skb);
-	} else {
-		goto need_help;
-	}
-
-	switch (protocol) {
-	case IPPROTO_IP:
-		if (!spec->ipv4_okay)
-			goto need_help;
-		iph = nhdr;
-		ip_proto = iph->protocol;
-		if (iph->ihl != 5 && !spec->ip_options_okay)
-			goto need_help;
-		break;
-	case IPPROTO_IPV6:
-		if (!spec->ipv6_okay)
-			goto need_help;
-		if (spec->no_encapped_ipv6 && *csum_encapped)
-			goto need_help;
-		ipv6 = nhdr;
-		nhdr += sizeof(*ipv6);
-		ip_proto = ipv6->nexthdr;
-		break;
-	default:
-		goto need_help;
-	}
-
-ip_proto_again:
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-		if (!spec->tcp_okay ||
-		    skb->csum_offset != offsetof(struct tcphdr, check))
-			goto need_help;
-		break;
-	case IPPROTO_UDP:
-		if (!spec->udp_okay ||
-		    skb->csum_offset != offsetof(struct udphdr, check))
-			goto need_help;
-		break;
-	case IPPROTO_SCTP:
-		if (!spec->sctp_okay ||
-		    skb->csum_offset != offsetof(struct sctphdr, checksum))
-			goto cant_help;
-		break;
-	case NEXTHDR_HOP:
-	case NEXTHDR_ROUTING:
-	case NEXTHDR_DEST: {
-		u8 *opthdr = nhdr;
-
-		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
-			goto need_help;
-
-		ip_proto = opthdr[0];
-		nhdr += (opthdr[1] + 1) << 3;
-
-		goto ip_proto_again;
-	}
-	default:
-		goto need_help;
-	}
-
-	/* Passed the tests for offloading checksum */
-	return true;
-
-need_help:
-	if (csum_help && !skb_shinfo(skb)->gso_size)
-		skb_checksum_help(skb);
-cant_help:
-	return false;
-}
-EXPORT_SYMBOL(__skb_csum_offload_chk);
-
 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 {
 	__be16 type = skb->protocol;
-- 
cgit v1.2.3


From c3aaa403840a5ccd305fb5e73f3cbfac6453b5e5 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Date: Fri, 14 Oct 2016 05:19:17 -0400
Subject: qed: Pass MAC hints to VFs

Some hypervisors can support MAC hints to their VFs.
Even though we don't have such a hypervisor API in linux, we add
sufficient logic for the VF to be able to receive such hints and
set the mac accordingly - as long as the VF has not been set with
a MAC already.

Signed-off-by: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_vf.c     | 4 ++--
 drivers/net/ethernet/qlogic/qede/qede_main.c | 6 +++++-
 include/linux/qed/qed_eth_if.h               | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index abf5bf11f865..f580bf4c97f0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -1230,8 +1230,8 @@ static void qed_handle_bulletin_change(struct qed_hwfn *hwfn)
 
 	is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac,
 						      &is_mac_forced);
-	if (is_mac_exist && is_mac_forced && cookie)
-		ops->force_mac(cookie, mac);
+	if (is_mac_exist && cookie)
+		ops->force_mac(cookie, mac, !!is_mac_forced);
 
 	/* Always update link configuration according to bulletin */
 	qed_link_update(hwfn);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 343038ca047d..9866d952e3e1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -171,10 +171,14 @@ static struct pci_driver qede_pci_driver = {
 #endif
 };
 
-static void qede_force_mac(void *dev, u8 *mac)
+static void qede_force_mac(void *dev, u8 *mac, bool forced)
 {
 	struct qede_dev *edev = dev;
 
+	/* MAC hints take effect only if we haven't set one already */
+	if (is_valid_ether_addr(edev->ndev->dev_addr) && !forced)
+		return;
+
 	ether_addr_copy(edev->ndev->dev_addr, mac);
 	ether_addr_copy(edev->primary_mac, mac);
 }
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 33c24ebc9b7f..1c779486c30d 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -129,7 +129,7 @@ struct qed_tunn_params {
 
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
-	void (*force_mac) (void *dev, u8 *mac);
+	void (*force_mac) (void *dev, u8 *mac, bool forced);
 };
 
 #ifdef CONFIG_DCB
-- 
cgit v1.2.3


From 7b7e70f979e34ed84d725eab8ea42921ab6f42e3 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Date: Fri, 14 Oct 2016 05:19:20 -0400
Subject: qed*: Allow unicast filtering

Apparently qede fails to set IFF_UNICAST_FLT, and as a result is not
actually performing unicast MAC filtering.
While we're at it - relax a hard-coded limitation that restricts each
interface to using at most 15 unicast MAC addresses before turning
promiscuous. Instead utilize the HW resources to their limit.

Signed-off-by: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c     | 12 ++++++++++--
 drivers/net/ethernet/qlogic/qede/qede_main.c |  4 +++-
 include/linux/qed/qed_eth_if.h               |  1 +
 3 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index ddd410a91e13..6b0e22d9fe4c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1652,6 +1652,7 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 
 	if (IS_PF(cdev)) {
 		int max_vf_vlan_filters = 0;
+		int max_vf_mac_filters = 0;
 
 		if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
 			for_each_hwfn(cdev, i)
@@ -1665,11 +1666,18 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 			info->num_queues = cdev->num_hwfns;
 		}
 
-		if (IS_QED_SRIOV(cdev))
+		if (IS_QED_SRIOV(cdev)) {
 			max_vf_vlan_filters = cdev->p_iov_info->total_vfs *
 					      QED_ETH_VF_NUM_VLAN_FILTERS;
-		info->num_vlan_filters = RESC_NUM(&cdev->hwfns[0], QED_VLAN) -
+			max_vf_mac_filters = cdev->p_iov_info->total_vfs *
+					     QED_ETH_VF_NUM_MAC_FILTERS;
+		}
+		info->num_vlan_filters = RESC_NUM(QED_LEADING_HWFN(cdev),
+						  QED_VLAN) -
 					 max_vf_vlan_filters;
+		info->num_mac_filters = RESC_NUM(QED_LEADING_HWFN(cdev),
+						 QED_MAC) -
+					max_vf_mac_filters;
 
 		ether_addr_copy(info->port_mac,
 				cdev->hwfns[0].hw_info.hw_mac_addr);
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 6c2b09c255d5..0e483afc2b87 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2365,6 +2365,8 @@ static void qede_init_ndev(struct qede_dev *edev)
 
 	qede_set_ethtool_ops(ndev);
 
+	ndev->priv_flags = IFF_UNICAST_FLT;
+
 	/* user-changeble features */
 	hw_features = NETIF_F_GRO | NETIF_F_SG |
 		      NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -3937,7 +3939,7 @@ static void qede_config_rx_mode(struct net_device *ndev)
 
 	/* Check for promiscuous */
 	if ((ndev->flags & IFF_PROMISC) ||
-	    (uc_count > 15)) { /* @@@TBD resource allocation - 1 */
+	    (uc_count > edev->dev_info.num_mac_filters - 1)) {
 		accept_flags = QED_FILTER_RX_MODE_TYPE_PROMISC;
 	} else {
 		/* Add MAC filters according to the unicast secondary macs */
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 1c779486c30d..15130805d792 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -23,6 +23,7 @@ struct qed_dev_eth_info {
 
 	u8	port_mac[ETH_ALEN];
 	u8	num_vlan_filters;
+	u16	num_mac_filters;
 
 	/* Legacy VF - this affects the datapath, so qede has to know */
 	bool is_legacy;
-- 
cgit v1.2.3


From 664fcf123a30edf16b47d2ce1f610d654ba917b2 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 16 Oct 2016 19:56:51 +0200
Subject: net: phy: Threaded interrupts allow some simplification

The PHY interrupts are now handled in a threaded interrupt handler,
which can sleep. The work queue is no longer needed, phy_change() can
be called directly. phy_mac_interrupt() still needs to be safe to call
in interrupt context, so keep the work queue, and use a helper to call
phy_change().

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        | 45 +++++++++++++++++++++++++-------------------
 drivers/net/phy/phy_device.c |  2 +-
 include/linux/phy.h          |  5 +++--
 3 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 25f2b296aaba..bb673c63c85c 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -664,7 +664,7 @@ static void phy_error(struct phy_device *phydev)
  * @phy_dat: phy_device pointer
  *
  * Description: When a PHY interrupt occurs, the handler disables
- * interrupts, and schedules a work task to clear the interrupt.
+ * interrupts, and uses phy_change to handle the interrupt.
  */
 static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 {
@@ -673,15 +673,10 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	if (PHY_HALTED == phydev->state)
 		return IRQ_NONE;		/* It can't be ours.  */
 
-	/* The MDIO bus is not allowed to be written in interrupt
-	 * context, so we need to disable the irq here.  A work
-	 * queue will write the PHY to disable and clear the
-	 * interrupt, and then reenable the irq line.
-	 */
 	disable_irq_nosync(irq);
 	atomic_inc(&phydev->irq_disable);
 
-	queue_work(system_power_efficient_wq, &phydev->phy_queue);
+	phy_change(phydev);
 
 	return IRQ_HANDLED;
 }
@@ -766,12 +761,6 @@ int phy_stop_interrupts(struct phy_device *phydev)
 
 	free_irq(phydev->irq, phydev);
 
-	/* Cannot call flush_scheduled_work() here as desired because
-	 * of rtnl_lock(), but we do not really care about what would
-	 * be done, except from enable_irq(), so cancel any work
-	 * possibly pending and take care of the matter below.
-	 */
-	cancel_work_sync(&phydev->phy_queue);
 	/* If work indeed has been cancelled, disable_irq() will have
 	 * been left unbalanced from phy_interrupt() and enable_irq()
 	 * has to be called so that other devices on the line work.
@@ -784,14 +773,11 @@ int phy_stop_interrupts(struct phy_device *phydev)
 EXPORT_SYMBOL(phy_stop_interrupts);
 
 /**
- * phy_change - Scheduled by the phy_interrupt/timer to handle PHY changes
- * @work: work_struct that describes the work to be done
+ * phy_change - Called by the phy_interrupt to handle PHY changes
+ * @phydev: phy_device struct that interrupted
  */
-void phy_change(struct work_struct *work)
+void phy_change(struct phy_device *phydev)
 {
-	struct phy_device *phydev =
-		container_of(work, struct phy_device, phy_queue);
-
 	if (phy_interrupt_is_valid(phydev)) {
 		if (phydev->drv->did_interrupt &&
 		    !phydev->drv->did_interrupt(phydev))
@@ -832,6 +818,18 @@ phy_err:
 	phy_error(phydev);
 }
 
+/**
+ * phy_change_work - Scheduled by the phy_mac_interrupt to handle PHY changes
+ * @work: work_struct that describes the work to be done
+ */
+void phy_change_work(struct work_struct *work)
+{
+	struct phy_device *phydev =
+		container_of(work, struct phy_device, phy_queue);
+
+	phy_change(phydev);
+}
+
 /**
  * phy_stop - Bring down the PHY link, and stop checking the status
  * @phydev: target phy_device struct
@@ -1116,6 +1114,15 @@ void phy_state_machine(struct work_struct *work)
 				   PHY_STATE_TIME * HZ);
 }
 
+/**
+ * phy_mac_interrupt - MAC says the link has changed
+ * @phydev: phy_device struct with changed link
+ * @new_link: Link is Up/Down.
+ *
+ * Description: The MAC layer is able to indicate there has been a change
+ *   in the PHY link status. Set the new link status, and trigger the
+ *   state machine via a work queue.
+ */
 void phy_mac_interrupt(struct phy_device *phydev, int new_link)
 {
 	phydev->link = new_link;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e977ba931878..ac440a815353 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -347,7 +347,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id,
 
 	mutex_init(&dev->lock);
 	INIT_DELAYED_WORK(&dev->state_queue, phy_state_machine);
-	INIT_WORK(&dev->phy_queue, phy_change);
+	INIT_WORK(&dev->phy_queue, phy_change_work);
 
 	/* Request the appropriate module unconditionally; don't
 	 * bother trying to do so only if it isn't already loaded,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e25f1830fbcf..c47378c93607 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -343,7 +343,7 @@ struct phy_c45_device_ids {
  * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
  * phy_timer: The timer for handling the state machine
- * phy_queue: A work_queue for the interrupt
+ * phy_queue: A work_queue for the phy_mac_interrupt
  * attached_dev: The attached enet driver's device instance ptr
  * adjust_link: Callback for the enet controller to respond to
  * changes in the link state.
@@ -802,7 +802,8 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
-void phy_change(struct work_struct *work);
+void phy_change(struct phy_device *phydev);
+void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev, int new_link);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-- 
cgit v1.2.3


From 1a3f060c1a47dba4e12ac21ce62b57666b9c4e95 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 17 Oct 2016 19:15:44 -0700
Subject: net: Introduce new api for walking upper and lower devices

This patch introduces netdev_walk_all_upper_dev_rcu,
netdev_walk_all_lower_dev and netdev_walk_all_lower_dev_rcu. These
functions recursively walk the adj_list of devices to determine all upper
and lower devices.

The functions take a callback function that is invoked for each device
in the list. If the callback returns non-0, the walk is terminated and
the functions return that code back to callers.

v3
- simplified netdev_has_upper_dev_all_rcu and __netdev_has_upper_dev and
  removed typecast as suggested by Stephen

v2
- fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev
  version.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  17 +++++
 net/core/dev.c            | 155 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf341b65ca5e..a5902d995907 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3778,6 +3778,14 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 	     updev; \
 	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
 
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *upper_dev,
+					    void *data),
+				  void *data);
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+				  struct net_device *upper_dev);
+
 void *netdev_lower_get_next_private(struct net_device *dev,
 				    struct list_head **iter);
 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
@@ -3821,6 +3829,15 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 	     ldev; \
 	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
 
+int netdev_walk_all_lower_dev(struct net_device *dev,
+			      int (*fn)(struct net_device *lower_dev,
+					void *data),
+			      void *data);
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *lower_dev,
+					    void *data),
+				  void *data);
+
 void *netdev_adjacent_get_private(struct list_head *adj_list);
 void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
diff --git a/net/core/dev.c b/net/core/dev.c
index f67fd16615bb..fc48337cfab8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5155,6 +5155,31 @@ bool netdev_has_upper_dev(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
+/**
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks the entire upper device chain.
+ * The caller must hold rcu lock.
+ */
+
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+	struct net_device *dev = data;
+
+	return upper_dev == dev;
+}
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+				  struct net_device *upper_dev)
+{
+	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+					       upper_dev);
+}
+EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
+
 /**
  * netdev_has_any_upper_dev - Check if device is linked to some device
  * @dev: device
@@ -5255,6 +5280,51 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
 
+static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
+						    struct list_head **iter)
+{
+	struct netdev_adjacent *upper;
+
+	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
+
+	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+
+	if (&upper->list == &dev->adj_list.upper)
+		return NULL;
+
+	*iter = &upper->list;
+
+	return upper->dev;
+}
+
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *dev,
+					    void *data),
+				  void *data)
+{
+	struct net_device *udev;
+	struct list_head *iter;
+	int ret;
+
+	for (iter = &dev->adj_list.upper,
+	     udev = netdev_next_upper_dev_rcu(dev, &iter);
+	     udev;
+	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
+		/* first is the upper device itself */
+		ret = fn(udev, data);
+		if (ret)
+			return ret;
+
+		/* then look at all of its upper devices */
+		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
+
 /**
  * netdev_lower_get_next_private - Get the next ->private from the
  *				   lower neighbour list
@@ -5361,6 +5431,49 @@ struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list
 }
 EXPORT_SYMBOL(netdev_all_lower_get_next);
 
+static struct net_device *netdev_next_lower_dev(struct net_device *dev,
+						struct list_head **iter)
+{
+	struct netdev_adjacent *lower;
+
+	lower = list_entry(*iter, struct netdev_adjacent, list);
+
+	if (&lower->list == &dev->adj_list.lower)
+		return NULL;
+
+	*iter = lower->list.next;
+
+	return lower->dev;
+}
+
+int netdev_walk_all_lower_dev(struct net_device *dev,
+			      int (*fn)(struct net_device *dev,
+					void *data),
+			      void *data)
+{
+	struct net_device *ldev;
+	struct list_head *iter;
+	int ret;
+
+	for (iter = &dev->adj_list.lower,
+	     ldev = netdev_next_lower_dev(dev, &iter);
+	     ldev;
+	     ldev = netdev_next_lower_dev(dev, &iter)) {
+		/* first is the lower device itself */
+		ret = fn(ldev, data);
+		if (ret)
+			return ret;
+
+		/* then look at all of its lower devices */
+		ret = netdev_walk_all_lower_dev(ldev, fn, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
+
 /**
  * netdev_all_lower_get_next_rcu - Get the next device from all
  *				   lower neighbour list, RCU variant
@@ -5382,6 +5495,48 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
 
+static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+						    struct list_head **iter)
+{
+	struct netdev_adjacent *lower;
+
+	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
+	if (&lower->list == &dev->adj_list.lower)
+		return NULL;
+
+	*iter = &lower->list;
+
+	return lower->dev;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *dev,
+					    void *data),
+				  void *data)
+{
+	struct net_device *ldev;
+	struct list_head *iter;
+	int ret;
+
+	for (iter = &dev->adj_list.lower,
+	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
+	     ldev;
+	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
+		/* first is the lower device itself */
+		ret = fn(ldev, data);
+		if (ret)
+			return ret;
+
+		/* then look at all of its lower devices */
+		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
+
 /**
  * netdev_lower_get_first_private_rcu - Get the first ->private from the
  *				       lower neighbour list, RCU
-- 
cgit v1.2.3


From f1170fd462c67c4ae2f20734566d94e0f8f62f69 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 17 Oct 2016 19:15:51 -0700
Subject: net: Remove all_adj_list and its references

Only direct adjacencies are maintained. All upper or lower devices can
be learned via the new walk API which recursively walks the adj_list for
upper devices or lower devices.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  25 ------
 net/core/dev.c            | 223 ++++------------------------------------------
 2 files changed, 18 insertions(+), 230 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a5902d995907..458c87631e7f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,7 +1456,6 @@ enum netdev_priv_flags {
  *	@ptype_specific: Device-specific, protocol-specific packet handlers
  *
  *	@adj_list:	Directly linked devices, like slaves for bonding
- *	@all_adj_list:	All linked devices, *including* neighbours
  *	@features:	Currently active device features
  *	@hw_features:	User-changeable features
  *
@@ -1675,11 +1674,6 @@ struct net_device {
 		struct list_head lower;
 	} adj_list;
 
-	struct {
-		struct list_head upper;
-		struct list_head lower;
-	} all_adj_list;
-
 	netdev_features_t	features;
 	netdev_features_t	hw_features;
 	netdev_features_t	wanted_features;
@@ -3771,13 +3765,6 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 	     updev; \
 	     updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))
 
-/* iterate through upper list, must be called under RCU read lock */
-#define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \
-	for (iter = &(dev)->all_adj_list.upper, \
-	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)); \
-	     updev; \
-	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
-
 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 				  int (*fn)(struct net_device *upper_dev,
 					    void *data),
@@ -3817,18 +3804,6 @@ struct net_device *netdev_all_lower_get_next(struct net_device *dev,
 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 						 struct list_head **iter);
 
-#define netdev_for_each_all_lower_dev(dev, ldev, iter) \
-	for (iter = (dev)->all_adj_list.lower.next, \
-	     ldev = netdev_all_lower_get_next(dev, &(iter)); \
-	     ldev; \
-	     ldev = netdev_all_lower_get_next(dev, &(iter)))
-
-#define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \
-	for (iter = (dev)->all_adj_list.lower.next, \
-	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \
-	     ldev; \
-	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
-
 int netdev_walk_all_lower_dev(struct net_device *dev,
 			      int (*fn)(struct net_device *lower_dev,
 					void *data),
diff --git a/net/core/dev.c b/net/core/dev.c
index fc48337cfab8..a9fe14908b44 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5137,6 +5137,13 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
 	return NULL;
 }
 
+static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+{
+	struct net_device *dev = data;
+
+	return upper_dev == dev;
+}
+
 /**
  * netdev_has_upper_dev - Check if device is linked to an upper device
  * @dev: device
@@ -5151,7 +5158,8 @@ bool netdev_has_upper_dev(struct net_device *dev,
 {
 	ASSERT_RTNL();
 
-	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
+	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
+					     upper_dev);
 }
 EXPORT_SYMBOL(netdev_has_upper_dev);
 
@@ -5165,13 +5173,6 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
  * The caller must hold rcu lock.
  */
 
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
-{
-	struct net_device *dev = data;
-
-	return upper_dev == dev;
-}
-
 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
 				  struct net_device *upper_dev)
 {
@@ -5191,7 +5192,7 @@ static bool netdev_has_any_upper_dev(struct net_device *dev)
 {
 	ASSERT_RTNL();
 
-	return !list_empty(&dev->all_adj_list.upper);
+	return !list_empty(&dev->adj_list.upper);
 }
 
 /**
@@ -5254,32 +5255,6 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
 
-/**
- * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next device from the dev's upper list, starting from iter
- * position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
-						     struct list_head **iter)
-{
-	struct netdev_adjacent *upper;
-
-	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
-
-	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
-
-	if (&upper->list == &dev->all_adj_list.upper)
-		return NULL;
-
-	*iter = &upper->list;
-
-	return upper->dev;
-}
-EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
-
 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
 						    struct list_head **iter)
 {
@@ -5406,31 +5381,6 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
 }
 EXPORT_SYMBOL(netdev_lower_get_next);
 
-/**
- * netdev_all_lower_get_next - Get the next device from all lower neighbour list
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RTNL lock or
- * its own locking that guarantees that the neighbour all lower
- * list will remain unchanged.
- */
-struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
-{
-	struct netdev_adjacent *lower;
-
-	lower = list_entry(*iter, struct netdev_adjacent, list);
-
-	if (&lower->list == &dev->all_adj_list.lower)
-		return NULL;
-
-	*iter = lower->list.next;
-
-	return lower->dev;
-}
-EXPORT_SYMBOL(netdev_all_lower_get_next);
-
 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
 						struct list_head **iter)
 {
@@ -5474,27 +5424,6 @@ int netdev_walk_all_lower_dev(struct net_device *dev,
 }
 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
 
-/**
- * netdev_all_lower_get_next_rcu - Get the next device from all
- *				   lower neighbour list, RCU variant
- * @dev: device
- * @iter: list_head ** of the current position
- *
- * Gets the next netdev_adjacent from the dev's all lower neighbour
- * list, starting from iter position. The caller must hold RCU read lock.
- */
-struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
-						 struct list_head **iter)
-{
-	struct netdev_adjacent *lower;
-
-	lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
-				       struct netdev_adjacent, list);
-
-	return lower ? lower->dev : NULL;
-}
-EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
-
 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
 						    struct list_head **iter)
 {
@@ -5722,15 +5651,6 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
 	return 0;
 }
 
-static int __netdev_adjacent_dev_link(struct net_device *dev,
-				      struct net_device *upper_dev)
-{
-	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
-						&dev->all_adj_list.upper,
-						&upper_dev->all_adj_list.lower,
-						NULL, false);
-}
-
 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 					       struct net_device *upper_dev,
 					       u16 ref_nr,
@@ -5741,40 +5661,19 @@ static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
 	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
 }
 
-static void __netdev_adjacent_dev_unlink(struct net_device *dev,
-					 struct net_device *upper_dev,
-					 u16 ref_nr)
-{
-	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
-					   &dev->all_adj_list.upper,
-					   &upper_dev->all_adj_list.lower);
-}
-
 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
 						struct net_device *upper_dev,
 						void *private, bool master)
 {
-	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
-
-	if (ret)
-		return ret;
-
-	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
-					       &dev->adj_list.upper,
-					       &upper_dev->adj_list.lower,
-					       private, master);
-	if (ret) {
-		__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
-		return ret;
-	}
-
-	return 0;
+	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
+						&dev->adj_list.upper,
+						&upper_dev->adj_list.lower,
+						private, master);
 }
 
 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 						   struct net_device *upper_dev)
 {
-	__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
 					   &dev->adj_list.upper,
 					   &upper_dev->adj_list.lower);
@@ -5785,7 +5684,6 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   void *upper_priv, void *upper_info)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
-	struct netdev_adjacent *i, *j, *to_i, *to_j;
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -5794,10 +5692,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 		return -EBUSY;
 
 	/* To prevent loops, check if dev is not upper device to upper_dev. */
-	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
+	if (netdev_has_upper_dev(upper_dev, dev))
 		return -EBUSY;
 
-	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
+	if (netdev_has_upper_dev(dev, upper_dev))
 		return -EEXIST;
 
 	if (master && netdev_master_upper_dev_get(dev))
@@ -5819,80 +5717,15 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	/* Now that we linked these devs, make all the upper_dev's
-	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
-	 * versa, and don't forget the devices itself. All of these
-	 * links are non-neighbours.
-	 */
-	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
-		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
-			pr_debug("Interlinking %s with %s, non-neighbour\n",
-				 i->dev->name, j->dev->name);
-			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
-			if (ret)
-				goto rollback_mesh;
-		}
-	}
-
-	/* add dev to every upper_dev's upper device */
-	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
-		pr_debug("linking %s's upper device %s with %s\n",
-			 upper_dev->name, i->dev->name, dev->name);
-		ret = __netdev_adjacent_dev_link(dev, i->dev);
-		if (ret)
-			goto rollback_upper_mesh;
-	}
-
-	/* add upper_dev to every dev's lower device */
-	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
-		pr_debug("linking %s's lower device %s with %s\n", dev->name,
-			 i->dev->name, upper_dev->name);
-		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
-		if (ret)
-			goto rollback_lower_mesh;
-	}
-
 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
 					    &changeupper_info.info);
 	ret = notifier_to_errno(ret);
 	if (ret)
-		goto rollback_lower_mesh;
+		goto rollback;
 
 	return 0;
 
-rollback_lower_mesh:
-	to_i = i;
-	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
-		if (i == to_i)
-			break;
-		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
-	}
-
-	i = NULL;
-
-rollback_upper_mesh:
-	to_i = i;
-	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
-		if (i == to_i)
-			break;
-		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
-	}
-
-	i = j = NULL;
-
-rollback_mesh:
-	to_i = i;
-	to_j = j;
-	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
-		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
-			if (i == to_i && j == to_j)
-				break;
-			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
-		}
-		if (i == to_i)
-			break;
-	}
-
+rollback:
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
 	return ret;
@@ -5949,7 +5782,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
-	struct netdev_adjacent *i, *j;
 	ASSERT_RTNL();
 
 	changeupper_info.upper_dev = upper_dev;
@@ -5961,23 +5793,6 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
 
-	/* Here is the tricky part. We must remove all dev's lower
-	 * devices from all upper_dev's upper devices and vice
-	 * versa, to maintain the graph relationship.
-	 */
-	list_for_each_entry(i, &dev->all_adj_list.lower, list)
-		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
-			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
-
-	/* remove also the devices itself from lower/upper device
-	 * list
-	 */
-	list_for_each_entry(i, &dev->all_adj_list.lower, list)
-		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
-
-	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
-		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
-
 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
 				      &changeupper_info.info);
 }
@@ -7679,8 +7494,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	INIT_LIST_HEAD(&dev->link_watch_list);
 	INIT_LIST_HEAD(&dev->adj_list.upper);
 	INIT_LIST_HEAD(&dev->adj_list.lower);
-	INIT_LIST_HEAD(&dev->all_adj_list.upper);
-	INIT_LIST_HEAD(&dev->all_adj_list.lower);
 	INIT_LIST_HEAD(&dev->ptype_all);
 	INIT_LIST_HEAD(&dev->ptype_specific);
 #ifdef CONFIG_NET_SCHED
-- 
cgit v1.2.3


From 1f9127caece42514a47011326b83ad93d95cd5d7 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@ni.com>
Date: Mon, 17 Oct 2016 10:49:54 -0500
Subject: net: phy: Create phy_supported_speeds function which lists speeds
 currently supported by a phydevice

phy_supported_speeds provides a means to get a list of all the speeds a
phy device currently supports.

Signed-off-by: Zach Brown <zach.brown@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 35 +++++++++++++++++++++++++++++++++++
 include/linux/phy.h   | 15 +++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 8b7659e94057..ee3c793124c7 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -260,6 +260,41 @@ static inline unsigned int phy_find_valid(unsigned int idx, u32 features)
 	return idx < MAX_NUM_SETTINGS ? idx : MAX_NUM_SETTINGS - 1;
 }
 
+/**
+ * phy_supported_speeds - return all speeds currently supported by a phy device
+ * @phy: The phy device to return supported speeds of.
+ * @speeds: buffer to store supported speeds in.
+ * @size:   size of speeds buffer.
+ *
+ * Description: Returns the number of supported speeds, and fills the speeds
+ * buffer with the supported speeds. If speeds buffer is too small to contain
+ * all currently supported speeds, will return as many speeds as can fit.
+ */
+unsigned int phy_supported_speeds(struct phy_device *phy,
+				  unsigned int *speeds,
+				  unsigned int size)
+{
+	unsigned int count = 0;
+	unsigned int idx = 0;
+
+	while (idx < MAX_NUM_SETTINGS && count < size) {
+		idx = phy_find_valid(idx, phy->supported);
+
+		if (!(settings[idx].setting & phy->supported))
+			break;
+
+		/* Assumes settings are grouped by speed */
+		if ((count == 0) ||
+		    (speeds[count - 1] != settings[idx].speed)) {
+			speeds[count] = settings[idx].speed;
+			count++;
+		}
+		idx++;
+	}
+
+	return count;
+}
+
 /**
  * phy_check_valid - check if there is a valid PHY setting which matches
  *		     speed, duplex, and feature mask
diff --git a/include/linux/phy.h b/include/linux/phy.h
index c47378c93607..4b6c246c63bb 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -84,6 +84,21 @@ typedef enum {
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
+/**
+ * phy_supported_speeds - return all speeds currently supported by a phy device
+ * @phy: The phy device to return supported speeds of.
+ * @speeds: buffer to store supported speeds in.
+ * @size: size of speeds buffer.
+ *
+ * Description: Returns the number of supported speeds, and
+ * fills the speeds buffer with the supported speeds. If speeds buffer is
+ * too small to contain all currently supported speeds, will return as
+ * many speeds as can fit.
+ */
+unsigned int phy_supported_speeds(struct phy_device *phy,
+				      unsigned int *speeds,
+				      unsigned int size);
+
 /**
  * It maps 'enum phy_interface_t' found in include/linux/phy.h
  * into the device tree binding of 'phy-mode', so that Ethernet
-- 
cgit v1.2.3


From 2e0bc452f4721520502575362a9cd3c1248d2337 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@ni.com>
Date: Mon, 17 Oct 2016 10:49:55 -0500
Subject: net: phy: leds: add support for led triggers on phy link state change

Create an option CONFIG_LED_TRIGGER_PHY (default n), which will create a
set of led triggers for each instantiated PHY device. There is one LED
trigger per link-speed, per-phy.
The triggers are registered during phy_attach and unregistered during
phy_detach.

This allows a user to configure their system so that a set of LEDs
not controlled by the phy can represent link state changes on the phy.
LEDs controlled by the phy are unaffected.

For example, we have a board where some of the leds in the
RJ45 socket are controlled by the phy, but others are not. Using the
triggers provided by this patch the leds not controlled by the phy can
be configured to show the current speed of the ethernet connection. The
leds controlled by the phy are unaffected.

Signed-off-by: Josh Cartwright <josh.cartwright@ni.com>
Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Signed-off-by: Zach Brown <zach.brown@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig            |  13 ++++
 drivers/net/phy/Makefile           |   1 +
 drivers/net/phy/phy.c              |   1 +
 drivers/net/phy/phy_device.c       |   5 ++
 drivers/net/phy/phy_led_triggers.c | 136 +++++++++++++++++++++++++++++++++++++
 include/linux/phy.h                |   7 ++
 include/linux/phy_led_triggers.h   |  51 ++++++++++++++
 7 files changed, 214 insertions(+)
 create mode 100644 drivers/net/phy/phy_led_triggers.c
 create mode 100644 include/linux/phy_led_triggers.h

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 2651c8d8de2f..45f68eaf9b79 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -15,6 +15,19 @@ if PHYLIB
 config SWPHY
 	bool
 
+config LED_TRIGGER_PHY
+	bool "Support LED triggers for tracking link state"
+	depends on LEDS_TRIGGERS
+	---help---
+	  Adds support for a set of LED trigger events per-PHY.  Link
+	  state changes will trigger the events, for consumption by an
+	  LED class driver.  There is one trigger for each link speed
+	  currently supported by the PHY, with names of the form:
+	       <mii bus id>:<phy>:<speed>
+
+	  Where speed is in the form:
+		<Speed in megabits>Mbps or <Speed in gigabits>Gbps
+
 comment "MDIO bus device drivers"
 
 config MDIO_BCM_IPROC
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index e58667d111e7..86d12cd3fbf0 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -2,6 +2,7 @@
 
 libphy-y			:= phy.o phy_device.o mdio_bus.o mdio_device.o
 libphy-$(CONFIG_SWPHY)		+= swphy.o
+libphy-$(CONFIG_LED_TRIGGER_PHY)	+= phy_led_triggers.o
 
 obj-$(CONFIG_PHYLIB)		+= libphy.o
 
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index ee3c793124c7..2f94c60d4939 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -946,6 +946,7 @@ EXPORT_SYMBOL(phy_start);
 static void phy_adjust_link(struct phy_device *phydev)
 {
 	phydev->adjust_link(phydev->attached_dev);
+	phy_led_trigger_change_speed(phydev);
 }
 
 /**
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index ac440a815353..49a1c988d29c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -30,6 +30,7 @@
 #include <linux/mii.h>
 #include <linux/ethtool.h>
 #include <linux/phy.h>
+#include <linux/phy_led_triggers.h>
 #include <linux/mdio.h>
 #include <linux/io.h>
 #include <linux/uaccess.h>
@@ -916,6 +917,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	else
 		phy_resume(phydev);
 
+	phy_led_triggers_register(phydev);
+
 	return err;
 
 error:
@@ -989,6 +992,8 @@ void phy_detach(struct phy_device *phydev)
 		}
 	}
 
+	phy_led_triggers_unregister(phydev);
+
 	/*
 	 * The phydev might go away on the put_device() below, so avoid
 	 * a use-after-free bug by reading the underlying bus first.
diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c
new file mode 100644
index 000000000000..cda600a1b766
--- /dev/null
+++ b/drivers/net/phy/phy_led_triggers.c
@@ -0,0 +1,136 @@
+/* Copyright (C) 2016 National Instruments Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/leds.h>
+#include <linux/phy.h>
+#include <linux/netdevice.h>
+
+static struct phy_led_trigger *phy_speed_to_led_trigger(struct phy_device *phy,
+							unsigned int speed)
+{
+	unsigned int i;
+
+	for (i = 0; i < phy->phy_num_led_triggers; i++) {
+		if (phy->phy_led_triggers[i].speed == speed)
+			return &phy->phy_led_triggers[i];
+	}
+	return NULL;
+}
+
+void phy_led_trigger_change_speed(struct phy_device *phy)
+{
+	struct phy_led_trigger *plt;
+
+	if (!phy->link)
+		goto out_change_speed;
+
+	if (phy->speed == 0)
+		return;
+
+	plt = phy_speed_to_led_trigger(phy, phy->speed);
+	if (!plt) {
+		netdev_alert(phy->attached_dev,
+			     "No phy led trigger registered for speed(%d)\n",
+			     phy->speed);
+		goto out_change_speed;
+	}
+
+	if (plt != phy->last_triggered) {
+		led_trigger_event(&phy->last_triggered->trigger, LED_OFF);
+		led_trigger_event(&plt->trigger, LED_FULL);
+		phy->last_triggered = plt;
+	}
+	return;
+
+out_change_speed:
+	if (phy->last_triggered) {
+		led_trigger_event(&phy->last_triggered->trigger,
+				  LED_OFF);
+		phy->last_triggered = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(phy_led_trigger_change_speed);
+
+static int phy_led_trigger_register(struct phy_device *phy,
+				    struct phy_led_trigger *plt,
+				    unsigned int speed)
+{
+	char name_suffix[PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE];
+
+	plt->speed = speed;
+
+	if (speed < SPEED_1000)
+		snprintf(name_suffix, sizeof(name_suffix), "%dMbps", speed);
+	else if (speed == SPEED_2500)
+		snprintf(name_suffix, sizeof(name_suffix), "2.5Gbps");
+	else
+		snprintf(name_suffix, sizeof(name_suffix), "%dGbps",
+			 DIV_ROUND_CLOSEST(speed, 1000));
+
+	snprintf(plt->name, sizeof(plt->name), PHY_ID_FMT ":%s",
+		 phy->mdio.bus->id, phy->mdio.addr, name_suffix);
+	plt->trigger.name = plt->name;
+
+	return led_trigger_register(&plt->trigger);
+}
+
+static void phy_led_trigger_unregister(struct phy_led_trigger *plt)
+{
+	led_trigger_unregister(&plt->trigger);
+}
+
+int phy_led_triggers_register(struct phy_device *phy)
+{
+	int i, err;
+	unsigned int speeds[50];
+
+	phy->phy_num_led_triggers = phy_supported_speeds(phy, speeds,
+							 ARRAY_SIZE(speeds));
+	if (!phy->phy_num_led_triggers)
+		return 0;
+
+	phy->phy_led_triggers = devm_kzalloc(&phy->mdio.dev,
+					    sizeof(struct phy_led_trigger) *
+						   phy->phy_num_led_triggers,
+					    GFP_KERNEL);
+	if (!phy->phy_led_triggers)
+		return -ENOMEM;
+
+	for (i = 0; i < phy->phy_num_led_triggers; i++) {
+		err = phy_led_trigger_register(phy, &phy->phy_led_triggers[i],
+					       speeds[i]);
+		if (err)
+			goto out_unreg;
+	}
+
+	phy->last_triggered = NULL;
+	phy_led_trigger_change_speed(phy);
+
+	return 0;
+out_unreg:
+	while (i--)
+		phy_led_trigger_unregister(&phy->phy_led_triggers[i]);
+	devm_kfree(&phy->mdio.dev, phy->phy_led_triggers);
+	return err;
+}
+EXPORT_SYMBOL_GPL(phy_led_triggers_register);
+
+void phy_led_triggers_unregister(struct phy_device *phy)
+{
+	int i;
+
+	for (i = 0; i < phy->phy_num_led_triggers; i++)
+		phy_led_trigger_unregister(&phy->phy_led_triggers[i]);
+
+	devm_kfree(&phy->mdio.dev, phy->phy_led_triggers);
+}
+EXPORT_SYMBOL_GPL(phy_led_triggers_unregister);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4b6c246c63bb..e7e1fd382564 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/mod_devicetable.h>
+#include <linux/phy_led_triggers.h>
 
 #include <linux/atomic.h>
 
@@ -420,6 +421,12 @@ struct phy_device {
 
 	int link_timeout;
 
+#ifdef CONFIG_LED_TRIGGER_PHY
+	struct phy_led_trigger *phy_led_triggers;
+	unsigned int phy_num_led_triggers;
+	struct phy_led_trigger *last_triggered;
+#endif
+
 	/*
 	 * Interrupt number for this PHY
 	 * -1 means no interrupt
diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h
new file mode 100644
index 000000000000..a2daea0a37d2
--- /dev/null
+++ b/include/linux/phy_led_triggers.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2016 National Instruments Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __PHY_LED_TRIGGERS
+#define __PHY_LED_TRIGGERS
+
+struct phy_device;
+
+#ifdef CONFIG_LED_TRIGGER_PHY
+
+#include <linux/leds.h>
+
+#define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE	10
+#define PHY_MII_BUS_ID_SIZE	(20 - 3)
+
+#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \
+				       FIELD_SIZEOF(struct mdio_device, addr)+\
+				       PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE)
+
+struct phy_led_trigger {
+	struct led_trigger trigger;
+	char name[PHY_LINK_LED_TRIGGER_NAME_SIZE];
+	unsigned int speed;
+};
+
+
+extern int phy_led_triggers_register(struct phy_device *phy);
+extern void phy_led_triggers_unregister(struct phy_device *phy);
+extern void phy_led_trigger_change_speed(struct phy_device *phy);
+
+#else
+
+static inline int phy_led_triggers_register(struct phy_device *phy)
+{
+	return 0;
+}
+static inline void phy_led_triggers_unregister(struct phy_device *phy) { }
+static inline void phy_led_trigger_change_speed(struct phy_device *phy) { }
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From 9a97434215819872b054c3d0c067e5e4fa768b0e Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Mon, 17 Oct 2016 21:45:29 +0200
Subject: ARM: pxa: enhance smc91x platform data

Instead of having the smc91x driver rely on machine_is_*() calls,
provide this data through platform data for the affected boards, i.e.
idp, mainstone and stargate.

This way, the driver no longer needs machine_is_*() calls, which
would not work with a device-tree build.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/mach-pxa/idp.c       | 1 +
 arch/arm/mach-pxa/mainstone.c | 1 +
 arch/arm/mach-pxa/stargate2.c | 1 +
 include/linux/smc91x.h        | 1 +
 4 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/arch/arm/mach-pxa/idp.c b/arch/arm/mach-pxa/idp.c
index 66070acaa888..d1db32b1a2c6 100644
--- a/arch/arm/mach-pxa/idp.c
+++ b/arch/arm/mach-pxa/idp.c
@@ -85,6 +85,7 @@ static struct resource smc91x_resources[] = {
 static struct smc91x_platdata smc91x_platdata = {
 	.flags = SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT |
 		 SMC91X_USE_DMA | SMC91X_NOWAIT,
+	.pxa_u16_align4 = true,
 };
 
 static struct platform_device smc91x_device = {
diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c
index 40964069a17c..a2d851a3a546 100644
--- a/arch/arm/mach-pxa/mainstone.c
+++ b/arch/arm/mach-pxa/mainstone.c
@@ -140,6 +140,7 @@ static struct resource smc91x_resources[] = {
 static struct smc91x_platdata mainstone_smc91x_info = {
 	.flags	= SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT |
 		  SMC91X_NOWAIT | SMC91X_USE_DMA,
+	.pxa_u16_align4 = true,
 };
 
 static struct platform_device smc91x_device = {
diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c
index 702f4f14b708..7b6610e9dae4 100644
--- a/arch/arm/mach-pxa/stargate2.c
+++ b/arch/arm/mach-pxa/stargate2.c
@@ -673,6 +673,7 @@ static struct resource smc91x_resources[] = {
 static struct smc91x_platdata stargate2_smc91x_info = {
 	.flags = SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT
 	| SMC91X_NOWAIT | SMC91X_USE_DMA,
+	.pxa_u16_align4 = true,
 };
 
 static struct platform_device smc91x_device = {
diff --git a/include/linux/smc91x.h b/include/linux/smc91x.h
index e302c447e057..129bc674dcf5 100644
--- a/include/linux/smc91x.h
+++ b/include/linux/smc91x.h
@@ -39,6 +39,7 @@ struct smc91x_platdata {
 	unsigned long flags;
 	unsigned char leda;
 	unsigned char ledb;
+	bool pxa_u16_align4;	/* PXA buggy u16 writes on 4*n+2 addresses */
 };
 
 #endif /* __SMC91X_H__ */
-- 
cgit v1.2.3


From 57a09bf0a416700676e77102c28f9cfcb48267e0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 18 Oct 2016 19:51:19 +0200
Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers

A BPF program is required to check the return register of a
map_lookup_elem() call before accessing memory. The verifier keeps
track of this by converting the type of the result register from
PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional
jump ensures safety. This check is currently exclusively performed
for the result register 0.

In the event the compiler reorders instructions, BPF_MOV64_REG
instructions may be moved before the conditional jump which causes
them to keep their type PTR_TO_MAP_VALUE_OR_NULL to which the
verifier objects when the register is accessed:

0: (b7) r1 = 10
1: (7b) *(u64 *)(r10 -8) = r1
2: (bf) r2 = r10
3: (07) r2 += -8
4: (18) r1 = 0x59c00000
6: (85) call 1
7: (bf) r4 = r0
8: (15) if r0 == 0x0 goto pc+1
 R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp
9: (7a) *(u64 *)(r4 +0) = 0
R4 invalid mem access 'map_value_or_null'

This commit extends the verifier to keep track of all identical
PTR_TO_MAP_VALUE_OR_NULL registers after a map_lookup_elem() call by
assigning them an ID and then marking them all when the conditional
jump is observed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h                |  2 +-
 kernel/bpf/verifier.c                       | 61 +++++++++++++++++-------
 tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7035b997aaa5..ac5b393ee6b2 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -23,13 +23,13 @@ struct bpf_reg_state {
 	 * result in a bad access.
 	 */
 	u64 min_value, max_value;
+	u32 id;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
 
 		/* valid when type == PTR_TO_PACKET* */
 		struct {
-			u32 id;
 			u16 off;
 			u16 range;
 		};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 99a7e5b388f2..846d7ceaf202 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -212,9 +212,10 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL ||
 			 t == PTR_TO_MAP_VALUE_ADJ)
-			verbose("(ks=%d,vs=%d)",
+			verbose("(ks=%d,vs=%d,id=%u)",
 				reg->map_ptr->key_size,
-				reg->map_ptr->value_size);
+				reg->map_ptr->value_size,
+				reg->id);
 		if (reg->min_value != BPF_REGISTER_MIN_RANGE)
 			verbose(",min_value=%llu",
 				(unsigned long long)reg->min_value);
@@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
 {
 	BUG_ON(regno >= MAX_BPF_REG);
 	regs[regno].type = UNKNOWN_VALUE;
+	regs[regno].id = 0;
 	regs[regno].imm = 0;
 }
 
@@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
 			return -EINVAL;
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
+		regs[BPF_REG_0].id = ++env->id_gen;
 	} else {
 		verbose("unknown return type %d of func %d\n",
 			fn->ret_type, func_id);
@@ -1644,8 +1647,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 						insn->src_reg);
 					return -EACCES;
 				}
-				regs[insn->dst_reg].type = UNKNOWN_VALUE;
-				regs[insn->dst_reg].map_ptr = NULL;
+				mark_reg_unknown_value(regs, insn->dst_reg);
 			}
 		} else {
 			/* case: R = imm
@@ -1907,6 +1909,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 	check_reg_overflow(true_reg);
 }
 
+static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
+			 enum bpf_reg_type type)
+{
+	struct bpf_reg_state *reg = &regs[regno];
+
+	if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+		reg->type = type;
+		if (type == UNKNOWN_VALUE)
+			mark_reg_unknown_value(regs, regno);
+	}
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+			  enum bpf_reg_type type)
+{
+	struct bpf_reg_state *regs = state->regs;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		mark_map_reg(regs, i, regs[regno].id, type);
+
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (state->stack_slot_type[i] != STACK_SPILL)
+			continue;
+		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE,
+			     regs[regno].id, type);
+	}
+}
+
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
@@ -1994,18 +2028,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (BPF_SRC(insn->code) == BPF_K &&
 	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
 	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
-		if (opcode == BPF_JEQ) {
-			/* next fallthrough insn can access memory via
-			 * this register
-			 */
-			regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
-			/* branch targer cannot access it, since reg == 0 */
-			mark_reg_unknown_value(other_branch->regs,
-					       insn->dst_reg);
-		} else {
-			other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
-			mark_reg_unknown_value(regs, insn->dst_reg);
-		}
+		/* Mark all identical map registers in each branch as either
+		 * safe or unknown depending R == 0 or R != 0 conditional.
+		 */
+		mark_map_regs(this_branch, insn->dst_reg,
+			      opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
+		mark_map_regs(other_branch, insn->dst_reg,
+			      opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index ff5df121b2f6..0ef8eaf6cea7 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -2588,6 +2588,78 @@ static struct bpf_test tests[] = {
 		.result_unpriv = REJECT,
 		.result = REJECT,
 	},
+	{
+		"multiple registers share map_lookup_elem result",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 10),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS
+	},
+	{
+		"invalid memory access with multiple map_lookup_elem calls",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 10),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.result = REJECT,
+		.errstr = "R4 !read_ok",
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS
+	},
+	{
+		"valid indirect map_lookup_elem access with 2nd lookup in branch",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 10),
+			BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_IMM(BPF_REG_2, 10),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 3),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_SCHED_CLS
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3


From 8b6b4135e4fb2b537f33b811c13f77bee25ca8d3 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Thu, 20 Oct 2016 13:55:19 -0400
Subject: net: use core MTU range checking in WAN drivers

- set min/max_mtu in all hdlc drivers, remove hdlc_change_mtu
- set max_mtu in lec driver, remove lec_change_mtu
- set min/max_mtu in x25_asy driver

CC: netdev@vger.kernel.org
CC: Krzysztof Halasa <khc@pm.waw.pl>
CC: Krzysztof Halasa <khalasa@piap.pl>
CC: Jan "Yenya" Kasprzak <kas@fi.muni.cz>
CC: Francois Romieu <romieu@fr.zoreil.com>
CC: Kevin Curtis <kevin.curtis@farsite.co.uk>
CC: Zhao Qiang <qiang.zhao@nxp.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/char/pcmcia/synclink_cs.c |  1 -
 drivers/net/wan/c101.c            |  1 -
 drivers/net/wan/cosa.c            |  1 -
 drivers/net/wan/dscc4.c           |  1 -
 drivers/net/wan/farsync.c         |  1 -
 drivers/net/wan/fsl_ucc_hdlc.c    |  1 -
 drivers/net/wan/hdlc.c            | 11 ++---------
 drivers/net/wan/hdlc_fr.c         |  3 ++-
 drivers/net/wan/hostess_sv11.c    |  1 -
 drivers/net/wan/ixp4xx_hss.c      |  1 -
 drivers/net/wan/lmc/lmc_main.c    |  1 -
 drivers/net/wan/n2.c              |  1 -
 drivers/net/wan/pc300too.c        |  1 -
 drivers/net/wan/pci200syn.c       |  1 -
 drivers/net/wan/sealevel.c        |  1 -
 drivers/net/wan/wanxl.c           |  1 -
 drivers/net/wan/x25_asy.c         |  5 ++---
 drivers/tty/synclink.c            |  1 -
 drivers/tty/synclink_gt.c         |  1 -
 drivers/tty/synclinkmp.c          |  1 -
 include/linux/hdlc.h              |  2 --
 net/atm/lec.c                     | 11 +----------
 22 files changed, 7 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index d28922df01d7..a7dd5f4f2c5a 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -4248,7 +4248,6 @@ static void hdlcdev_rx(MGSLPC_INFO *info, char *buf, int size)
 static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hdlcdev_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
diff --git a/drivers/net/wan/c101.c b/drivers/net/wan/c101.c
index 09a50751763b..2371e078afbb 100644
--- a/drivers/net/wan/c101.c
+++ b/drivers/net/wan/c101.c
@@ -302,7 +302,6 @@ static void c101_destroy_card(card_t *card)
 static const struct net_device_ops c101_ops = {
 	.ndo_open       = c101_open,
 	.ndo_stop       = c101_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = c101_ioctl,
 };
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index b87fe0a01c69..087eb266601f 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -432,7 +432,6 @@ module_exit(cosa_exit);
 static const struct net_device_ops cosa_ops = {
 	.ndo_open       = cosa_net_open,
 	.ndo_stop       = cosa_net_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = cosa_net_ioctl,
 	.ndo_tx_timeout = cosa_net_timeout,
diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c
index 629225980463..7351e5440ed7 100644
--- a/drivers/net/wan/dscc4.c
+++ b/drivers/net/wan/dscc4.c
@@ -887,7 +887,6 @@ static inline int dscc4_set_quartz(struct dscc4_dev_priv *dpriv, int hz)
 static const struct net_device_ops dscc4_ops = {
 	.ndo_open       = dscc4_open,
 	.ndo_stop       = dscc4_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = dscc4_ioctl,
 	.ndo_tx_timeout = dscc4_tx_timeout,
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 3c9cbf908ec7..03696d35ee9c 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -2394,7 +2394,6 @@ fst_init_card(struct fst_card_info *card)
 static const struct net_device_ops fst_ops = {
 	.ndo_open       = fst_open,
 	.ndo_stop       = fst_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = fst_ioctl,
 	.ndo_tx_timeout = fst_tx_timeout,
diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 65647533b401..e38ce4da3efb 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -992,7 +992,6 @@ static const struct dev_pm_ops uhdlc_pm_ops = {
 static const struct net_device_ops uhdlc_ops = {
 	.ndo_open       = uhdlc_open,
 	.ndo_stop       = uhdlc_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = uhdlc_ioctl,
 };
diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c
index 9bd4aa8083ce..7221a53b8b14 100644
--- a/drivers/net/wan/hdlc.c
+++ b/drivers/net/wan/hdlc.c
@@ -46,14 +46,6 @@ static const char* version = "HDLC support module revision 1.22";
 
 static struct hdlc_proto *first_proto;
 
-int hdlc_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < 68) || (new_mtu > HDLC_MAX_MTU))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev,
 		    struct packet_type *p, struct net_device *orig_dev)
 {
@@ -237,6 +229,8 @@ static void hdlc_setup_dev(struct net_device *dev)
 	dev->flags		 = IFF_POINTOPOINT | IFF_NOARP;
 	dev->priv_flags		 = IFF_WAN_HDLC;
 	dev->mtu		 = HDLC_MAX_MTU;
+	dev->min_mtu		 = 68;
+	dev->max_mtu		 = HDLC_MAX_MTU;
 	dev->type		 = ARPHRD_RAWHDLC;
 	dev->hard_header_len	 = 16;
 	dev->addr_len		 = 0;
@@ -353,7 +347,6 @@ MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("HDLC support module");
 MODULE_LICENSE("GPL v2");
 
-EXPORT_SYMBOL(hdlc_change_mtu);
 EXPORT_SYMBOL(hdlc_start_xmit);
 EXPORT_SYMBOL(hdlc_open);
 EXPORT_SYMBOL(hdlc_close);
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index b6e0cfb095d3..eb915281197e 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1053,7 +1053,6 @@ static void pvc_setup(struct net_device *dev)
 static const struct net_device_ops pvc_ops = {
 	.ndo_open       = pvc_open,
 	.ndo_stop       = pvc_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = pvc_xmit,
 	.ndo_do_ioctl   = pvc_ioctl,
 };
@@ -1096,6 +1095,8 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 	}
 	dev->netdev_ops = &pvc_ops;
 	dev->mtu = HDLC_MAX_MTU;
+	dev->min_mtu = 68;
+	dev->max_mtu = HDLC_MAX_MTU;
 	dev->priv_flags |= IFF_NO_QUEUE;
 	dev->ml_priv = pvc;
 
diff --git a/drivers/net/wan/hostess_sv11.c b/drivers/net/wan/hostess_sv11.c
index 3d741663fd67..dd6bb3364ad2 100644
--- a/drivers/net/wan/hostess_sv11.c
+++ b/drivers/net/wan/hostess_sv11.c
@@ -180,7 +180,6 @@ static int hostess_attach(struct net_device *dev, unsigned short encoding,
 static const struct net_device_ops hostess_ops = {
 	.ndo_open       = hostess_open,
 	.ndo_stop       = hostess_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hostess_ioctl,
 };
diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index e7bbdb7af53a..6a505c26a3e7 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c
@@ -1321,7 +1321,6 @@ static int hss_hdlc_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 static const struct net_device_ops hss_hdlc_ops = {
 	.ndo_open       = hss_hdlc_open,
 	.ndo_stop       = hss_hdlc_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hss_hdlc_ioctl,
 };
diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c
index 299140c04556..001b7796740d 100644
--- a/drivers/net/wan/lmc/lmc_main.c
+++ b/drivers/net/wan/lmc/lmc_main.c
@@ -808,7 +808,6 @@ static int lmc_attach(struct net_device *dev, unsigned short encoding,
 static const struct net_device_ops lmc_ops = {
 	.ndo_open       = lmc_open,
 	.ndo_stop       = lmc_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = lmc_ioctl,
 	.ndo_tx_timeout = lmc_driver_timeout,
diff --git a/drivers/net/wan/n2.c b/drivers/net/wan/n2.c
index 315bf09d6a20..c8f4517db3a0 100644
--- a/drivers/net/wan/n2.c
+++ b/drivers/net/wan/n2.c
@@ -330,7 +330,6 @@ static void n2_destroy_card(card_t *card)
 static const struct net_device_ops n2_ops = {
 	.ndo_open       = n2_open,
 	.ndo_stop       = n2_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = n2_ioctl,
 };
diff --git a/drivers/net/wan/pc300too.c b/drivers/net/wan/pc300too.c
index db363856e0b5..e1dd1ec18d64 100644
--- a/drivers/net/wan/pc300too.c
+++ b/drivers/net/wan/pc300too.c
@@ -291,7 +291,6 @@ static void pc300_pci_remove_one(struct pci_dev *pdev)
 static const struct net_device_ops pc300_ops = {
 	.ndo_open       = pc300_open,
 	.ndo_stop       = pc300_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = pc300_ioctl,
 };
diff --git a/drivers/net/wan/pci200syn.c b/drivers/net/wan/pci200syn.c
index e8455621390e..4e437c599e9a 100644
--- a/drivers/net/wan/pci200syn.c
+++ b/drivers/net/wan/pci200syn.c
@@ -270,7 +270,6 @@ static void pci200_pci_remove_one(struct pci_dev *pdev)
 static const struct net_device_ops pci200_ops = {
 	.ndo_open       = pci200_open,
 	.ndo_stop       = pci200_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = pci200_ioctl,
 };
diff --git a/drivers/net/wan/sealevel.c b/drivers/net/wan/sealevel.c
index 27860b4f5908..fbb5aa2c4d8f 100644
--- a/drivers/net/wan/sealevel.c
+++ b/drivers/net/wan/sealevel.c
@@ -174,7 +174,6 @@ static int sealevel_attach(struct net_device *dev, unsigned short encoding,
 static const struct net_device_ops sealevel_ops = {
 	.ndo_open       = sealevel_open,
 	.ndo_stop       = sealevel_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = sealevel_ioctl,
 };
diff --git a/drivers/net/wan/wanxl.c b/drivers/net/wan/wanxl.c
index a20d688d2595..0c7317520ed3 100644
--- a/drivers/net/wan/wanxl.c
+++ b/drivers/net/wan/wanxl.c
@@ -551,7 +551,6 @@ static void wanxl_pci_remove_one(struct pci_dev *pdev)
 static const struct net_device_ops wanxl_ops = {
 	.ndo_open       = wanxl_open,
 	.ndo_stop       = wanxl_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = wanxl_ioctl,
 	.ndo_get_stats  = wanxl_get_stats,
diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index 1bc5e93d2a34..878b05d06fc7 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -124,9 +124,6 @@ static int x25_asy_change_mtu(struct net_device *dev, int newmtu)
 	unsigned char *xbuff, *rbuff;
 	int len;
 
-	if (newmtu > 65534)
-		return -EINVAL;
-
 	len = 2 * newmtu;
 	xbuff = kmalloc(len + 4, GFP_ATOMIC);
 	rbuff = kmalloc(len + 4, GFP_ATOMIC);
@@ -751,6 +748,8 @@ static void x25_asy_setup(struct net_device *dev)
 	 */
 
 	dev->mtu		= SL_MTU;
+	dev->min_mtu		= 0;
+	dev->max_mtu		= 65534;
 	dev->netdev_ops		= &x25_asy_netdev_ops;
 	dev->watchdog_timeo	= HZ*20;
 	dev->hard_header_len	= 0;
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index c13e27ecb0b7..415885c56435 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -7973,7 +7973,6 @@ static void hdlcdev_rx(struct mgsl_struct *info, char *buf, int size)
 static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hdlcdev_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 7aca2d4670e4..8267bcf2405e 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -1768,7 +1768,6 @@ static void hdlcdev_rx(struct slgt_info *info, char *buf, int size)
 static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hdlcdev_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index dec156586de1..d66620f7eaa3 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -1887,7 +1887,6 @@ static void hdlcdev_rx(SLMP_INFO *info, char *buf, int size)
 static const struct net_device_ops hdlcdev_ops = {
 	.ndo_open       = hdlcdev_open,
 	.ndo_stop       = hdlcdev_close,
-	.ndo_change_mtu = hdlc_change_mtu,
 	.ndo_start_xmit = hdlc_start_xmit,
 	.ndo_do_ioctl   = hdlcdev_ioctl,
 	.ndo_tx_timeout = hdlcdev_tx_timeout,
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index e31bcd4c7859..97585d9679f3 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -93,8 +93,6 @@ static __inline__ void debug_frame(const struct sk_buff *skb)
 int hdlc_open(struct net_device *dev);
 /* Must be called by hardware driver when HDLC device is being closed */
 void hdlc_close(struct net_device *dev);
-/* May be used by hardware driver */
-int hdlc_change_mtu(struct net_device *dev, int new_mtu);
 /* Must be pointed to by hw driver's dev->netdev_ops->ndo_start_xmit */
 netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 5d2693826afb..779b3fa6052d 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -544,15 +544,6 @@ send_to_lecd(struct lec_priv *priv, atmlec_msg_type type,
 	return 0;
 }
 
-/* shamelessly stolen from drivers/net/net_init.c */
-static int lec_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < 68) || (new_mtu > 18190))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 static void lec_set_multicast_list(struct net_device *dev)
 {
 	/*
@@ -565,7 +556,6 @@ static const struct net_device_ops lec_netdev_ops = {
 	.ndo_open		= lec_open,
 	.ndo_stop		= lec_close,
 	.ndo_start_xmit		= lec_start_xmit,
-	.ndo_change_mtu		= lec_change_mtu,
 	.ndo_tx_timeout		= lec_tx_timeout,
 	.ndo_set_rx_mode	= lec_set_multicast_list,
 };
@@ -742,6 +732,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
 		if (!dev_lec[i])
 			return -ENOMEM;
 		dev_lec[i]->netdev_ops = &lec_netdev_ops;
+		dev_lec[i]->max_mtu = 18190;
 		snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
 		if (register_netdev(dev_lec[i])) {
 			free_netdev(dev_lec[i]);
-- 
cgit v1.2.3


From b3e3893e1253692c3d2b8e8ebd5a26183ed30443 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Thu, 20 Oct 2016 13:55:22 -0400
Subject: net: use core MTU range checking in misc drivers

firewire-net:
- set min/max_mtu
- remove fwnet_change_mtu

nes:
- set max_mtu
- clean up nes_netdev_change_mtu

xpnet:
- set min/max_mtu
- remove xpnet_dev_change_mtu

hippi:
- set min/max_mtu
- remove hippi_change_mtu

batman-adv:
- set max_mtu
- remove batadv_interface_change_mtu
- initialization is a little async, not 100% certain that max_mtu is set
  in the optimal place, don't have hardware to test with

rionet:
- set min/max_mtu
- remove rionet_change_mtu

slip:
- set min/max_mtu
- streamline sl_change_mtu

um/net_kern:
- remove pointless ndo_change_mtu

hsi/clients/ssi_protocol:
- use core MTU range checking
- remove now redundant ssip_pn_set_mtu

ipoib:
- set a default max MTU value
- Note: ipoib's actual max MTU can vary, depending on if the device is in
  connected mode or not, so we'll just set the max_mtu value to the max
  possible, and let the ndo_change_mtu function continue to validate any new
  MTU change requests with checks for CM or not. Note that ipoib has no
  min_mtu set, and thus, the network core's mtu > 0 check is the only lower
  bound here.

mptlan:
- use net core MTU range checking
- remove now redundant mpt_lan_change_mtu

fddi:
- min_mtu = 21, max_mtu = 4470
- remove now redundant fddi_change_mtu (including export)

fjes:
- min_mtu = 8192, max_mtu = 65536
- The max_mtu value is actually one over IP_MAX_MTU here, but the idea is to
  get past the core net MTU range checks so fjes_change_mtu can validate a
  new MTU against what it supports (see fjes_support_mtu in fjes_hw.c)

hsr:
- min_mtu = 0 (calls ether_setup, max_mtu is 1500)

f_phonet:
- min_mtu = 6, max_mtu = 65541

u_ether:
- min_mtu = 14, max_mtu = 15412

phonet/pep-gprs:
- min_mtu = 576, max_mtu = 65530
- remove redundant gprs_set_mtu

CC: netdev@vger.kernel.org
CC: linux-rdma@vger.kernel.org
CC: Stefan Richter <stefanr@s5r6.in-berlin.de>
CC: Faisal Latif <faisal.latif@intel.com>
CC: linux-rdma@vger.kernel.org
CC: Cliff Whickman <cpw@sgi.com>
CC: Robin Holt <robinmholt@gmail.com>
CC: Jes Sorensen <jes@trained-monkey.org>
CC: Marek Lindner <mareklindner@neomailbox.ch>
CC: Simon Wunderlich <sw@simonwunderlich.de>
CC: Antonio Quartulli <a@unstable.cc>
CC: Sathya Prakash <sathya.prakash@broadcom.com>
CC: Chaitra P B <chaitra.basappa@broadcom.com>
CC: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
CC: MPT-FusionLinux.pdl@broadcom.com
CC: Sebastian Reichel <sre@kernel.org>
CC: Felipe Balbi <balbi@kernel.org>
CC: Arvid Brodin <arvid.brodin@alten.se>
CC: Remi Denis-Courmont <courmisch@gmail.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/um/drivers/net_kern.c                |  8 --------
 drivers/firewire/net.c                    | 18 ++++--------------
 drivers/hsi/clients/ssi_protocol.c        | 14 ++++----------
 drivers/infiniband/hw/nes/nes.c           |  1 -
 drivers/infiniband/hw/nes/nes.h           |  4 ++--
 drivers/infiniband/hw/nes/nes_nic.c       | 10 +++-------
 drivers/infiniband/ulp/ipoib/ipoib_main.c |  1 +
 drivers/message/fusion/mptlan.c           | 15 ++++-----------
 drivers/misc/sgi-xp/xpnet.c               | 21 ++++-----------------
 drivers/net/fddi/skfp/skfddi.c            |  1 -
 drivers/net/fjes/fjes_main.c              |  2 ++
 drivers/net/hippi/rrunner.c               |  1 -
 drivers/net/rionet.c                      | 15 +++------------
 drivers/net/slip/slip.c                   | 11 +++++------
 drivers/usb/gadget/function/f_phonet.c    | 11 ++---------
 drivers/usb/gadget/function/u_ether.c     | 14 ++++----------
 include/linux/fddidevice.h                |  1 -
 include/linux/hippidevice.h               |  1 -
 net/802/fddi.c                            | 11 ++---------
 net/802/hippi.c                           | 14 ++------------
 net/batman-adv/soft-interface.c           | 13 +------------
 net/hsr/hsr_device.c                      |  1 +
 net/phonet/pep-gprs.c                     | 12 ++----------
 23 files changed, 46 insertions(+), 154 deletions(-)

(limited to 'include/linux')

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 2cd5b6874c7b..1669240c7a25 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -256,13 +256,6 @@ static void uml_net_tx_timeout(struct net_device *dev)
 	netif_wake_queue(dev);
 }
 
-static int uml_net_change_mtu(struct net_device *dev, int new_mtu)
-{
-	dev->mtu = new_mtu;
-
-	return 0;
-}
-
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void uml_net_poll_controller(struct net_device *dev)
 {
@@ -374,7 +367,6 @@ static const struct net_device_ops uml_netdev_ops = {
 	.ndo_set_rx_mode	= uml_net_set_multicast_list,
 	.ndo_tx_timeout 	= uml_net_tx_timeout,
 	.ndo_set_mac_address	= eth_mac_addr,
-	.ndo_change_mtu 	= uml_net_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller = uml_net_poll_controller,
diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 309311b1faae..8430222151fc 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1349,15 +1349,6 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct net_device *net)
 	return NETDEV_TX_OK;
 }
 
-static int fwnet_change_mtu(struct net_device *net, int new_mtu)
-{
-	if (new_mtu < 68)
-		return -EINVAL;
-
-	net->mtu = new_mtu;
-	return 0;
-}
-
 static const struct ethtool_ops fwnet_ethtool_ops = {
 	.get_link	= ethtool_op_get_link,
 };
@@ -1366,7 +1357,6 @@ static const struct net_device_ops fwnet_netdev_ops = {
 	.ndo_open       = fwnet_open,
 	.ndo_stop	= fwnet_stop,
 	.ndo_start_xmit = fwnet_tx,
-	.ndo_change_mtu = fwnet_change_mtu,
 };
 
 static void fwnet_init_dev(struct net_device *net)
@@ -1435,7 +1425,6 @@ static int fwnet_probe(struct fw_unit *unit,
 	struct net_device *net;
 	bool allocated_netdev = false;
 	struct fwnet_device *dev;
-	unsigned max_mtu;
 	int ret;
 	union fwnet_hwaddr *ha;
 
@@ -1478,9 +1467,10 @@ static int fwnet_probe(struct fw_unit *unit,
 	 * Use the RFC 2734 default 1500 octets or the maximum payload
 	 * as initial MTU
 	 */
-	max_mtu = (1 << (card->max_receive + 1))
-		  - sizeof(struct rfc2734_header) - IEEE1394_GASP_HDR_SIZE;
-	net->mtu = min(1500U, max_mtu);
+	net->max_mtu = (1 << (card->max_receive + 1))
+		       - sizeof(struct rfc2734_header) - IEEE1394_GASP_HDR_SIZE;
+	net->mtu = min(1500U, net->max_mtu);
+	net->min_mtu = ETH_MIN_MTU;
 
 	/* Set our hardware address while we're at it */
 	ha = (union fwnet_hwaddr *)net->dev_addr;
diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c
index 6031cd146556..7ef819680acd 100644
--- a/drivers/hsi/clients/ssi_protocol.c
+++ b/drivers/hsi/clients/ssi_protocol.c
@@ -960,15 +960,6 @@ static int ssip_pn_stop(struct net_device *dev)
 	return 0;
 }
 
-static int ssip_pn_set_mtu(struct net_device *dev, int new_mtu)
-{
-	if (new_mtu > SSIP_MAX_MTU || new_mtu < PHONET_MIN_MTU)
-		return -EINVAL;
-	dev->mtu = new_mtu;
-
-	return 0;
-}
-
 static void ssip_xmit_work(struct work_struct *work)
 {
 	struct ssi_protocol *ssi =
@@ -1060,7 +1051,6 @@ static const struct net_device_ops ssip_pn_ops = {
 	.ndo_open	= ssip_pn_open,
 	.ndo_stop	= ssip_pn_stop,
 	.ndo_start_xmit	= ssip_pn_xmit,
-	.ndo_change_mtu	= ssip_pn_set_mtu,
 };
 
 static void ssip_pn_setup(struct net_device *dev)
@@ -1136,6 +1126,10 @@ static int ssi_protocol_probe(struct device *dev)
 		goto out1;
 	}
 
+	/* MTU range: 6 - 65535 */
+	ssi->netdev->min_mtu = PHONET_MIN_MTU;
+	ssi->netdev->max_mtu = SSIP_MAX_MTU;
+
 	SET_NETDEV_DEV(ssi->netdev, dev);
 	netif_carrier_off(ssi->netdev);
 	err = register_netdev(ssi->netdev);
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index 35cbb17bec12..2baa45a8e401 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -65,7 +65,6 @@ MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_VERSION(DRV_VERSION);
 
-int max_mtu = 9000;
 int interrupt_mod_interval = 0;
 
 /* Interoperability */
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index e7430c9254d3..85acd0843b50 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -83,6 +83,8 @@
 #define NES_FIRST_QPN           64
 #define NES_SW_CONTEXT_ALIGN    1024
 
+#define NES_MAX_MTU		9000
+
 #define NES_NIC_MAX_NICS        16
 #define NES_MAX_ARP_TABLE_SIZE  4096
 
@@ -169,8 +171,6 @@ do { \
 #include "nes_cm.h"
 #include "nes_mgt.h"
 
-extern int max_mtu;
-#define max_frame_len (max_mtu+ETH_HLEN)
 extern int interrupt_mod_interval;
 extern int nes_if_count;
 extern int mpa_version;
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index 2b27d1351cf7..7f8597d6738b 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -981,20 +981,16 @@ static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
 {
 	struct nes_vnic	*nesvnic = netdev_priv(netdev);
 	struct nes_device *nesdev = nesvnic->nesdev;
-	int ret = 0;
 	u8 jumbomode = 0;
 	u32 nic_active;
 	u32 nic_active_bit;
 	u32 uc_all_active;
 	u32 mc_all_active;
 
-	if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu))
-		return -EINVAL;
-
 	netdev->mtu = new_mtu;
 	nesvnic->max_frame_size	= new_mtu + VLAN_ETH_HLEN;
 
-	if (netdev->mtu	> 1500)	{
+	if (netdev->mtu	> ETH_DATA_LEN)	{
 		jumbomode=1;
 	}
 	nes_nic_init_timer_defaults(nesdev, jumbomode);
@@ -1020,7 +1016,7 @@ static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
 		nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
 	}
 
-	return ret;
+	return 0;
 }
 
 
@@ -1658,7 +1654,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
 
 	netdev->watchdog_timeo = NES_TX_TIMEOUT;
 	netdev->irq = nesdev->pcidev->irq;
-	netdev->mtu = ETH_DATA_LEN;
+	netdev->max_mtu = NES_MAX_MTU;
 	netdev->hard_header_len = ETH_HLEN;
 	netdev->addr_len = ETH_ALEN;
 	netdev->type = ARPHRD_ETHER;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index cc059218c962..ae5d7cd100a5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2017,6 +2017,7 @@ static struct net_device *ipoib_add_port(const char *format,
 	/* MTU will be reset when mcast join happens */
 	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
 	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+	priv->dev->max_mtu = IPOIB_CM_MTU;
 
 	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
 
diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c
index 6955c9e22d57..55dd71bbdc2a 100644
--- a/drivers/message/fusion/mptlan.c
+++ b/drivers/message/fusion/mptlan.c
@@ -548,16 +548,6 @@ mpt_lan_close(struct net_device *dev)
 	return 0;
 }
 
-/*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
-static int
-mpt_lan_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < MPT_LAN_MIN_MTU) || (new_mtu > MPT_LAN_MAX_MTU))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/
 /* Tx timeout handler. */
 static void
@@ -1304,7 +1294,6 @@ static const struct net_device_ops mpt_netdev_ops = {
 	.ndo_open       = mpt_lan_open,
 	.ndo_stop       = mpt_lan_close,
 	.ndo_start_xmit = mpt_lan_sdu_send,
-	.ndo_change_mtu = mpt_lan_change_mtu,
 	.ndo_tx_timeout = mpt_lan_tx_timeout,
 };
 
@@ -1375,6 +1364,10 @@ mpt_register_lan_device (MPT_ADAPTER *mpt_dev, int pnum)
 	dev->netdev_ops = &mpt_netdev_ops;
 	dev->watchdog_timeo = MPT_LAN_TX_TIMEOUT;
 
+	/* MTU range: 96 - 65280 */
+	dev->min_mtu = MPT_LAN_MIN_MTU;
+	dev->max_mtu = MPT_LAN_MAX_MTU;
+
 	dlprintk((KERN_INFO MYNAM ": Finished registering dev "
 		"and setting initial values\n"));
 
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
index 557f9782c53c..0c26eaf5f62b 100644
--- a/drivers/misc/sgi-xp/xpnet.c
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -118,6 +118,8 @@ static DEFINE_SPINLOCK(xpnet_broadcast_lock);
  * now, the default is 64KB.
  */
 #define XPNET_MAX_MTU (0x800000UL - L1_CACHE_BYTES)
+/* 68 comes from min TCP+IP+MAC header */
+#define XPNET_MIN_MTU 68
 /* 32KB has been determined to be the ideal */
 #define XPNET_DEF_MTU (0x8000UL)
 
@@ -330,22 +332,6 @@ xpnet_dev_stop(struct net_device *dev)
 	return 0;
 }
 
-static int
-xpnet_dev_change_mtu(struct net_device *dev, int new_mtu)
-{
-	/* 68 comes from min TCP+IP+MAC header */
-	if ((new_mtu < 68) || (new_mtu > XPNET_MAX_MTU)) {
-		dev_err(xpnet, "ifconfig %s mtu %d failed; value must be "
-			"between 68 and %ld\n", dev->name, new_mtu,
-			XPNET_MAX_MTU);
-		return -EINVAL;
-	}
-
-	dev->mtu = new_mtu;
-	dev_dbg(xpnet, "ifconfig %s mtu set to %d\n", dev->name, new_mtu);
-	return 0;
-}
-
 /*
  * Notification that the other end has received the message and
  * DMA'd the skb information.  At this point, they are done with
@@ -519,7 +505,6 @@ static const struct net_device_ops xpnet_netdev_ops = {
 	.ndo_open		= xpnet_dev_open,
 	.ndo_stop		= xpnet_dev_stop,
 	.ndo_start_xmit		= xpnet_dev_hard_start_xmit,
-	.ndo_change_mtu		= xpnet_dev_change_mtu,
 	.ndo_tx_timeout		= xpnet_dev_tx_timeout,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -555,6 +540,8 @@ xpnet_init(void)
 
 	xpnet_device->netdev_ops = &xpnet_netdev_ops;
 	xpnet_device->mtu = XPNET_DEF_MTU;
+	xpnet_device->min_mtu = XPNET_MIN_MTU;
+	xpnet_device->max_mtu = XPNET_MAX_MTU;
 
 	/*
 	 * Multicast assumes the LSB of the first octet is set for multicast
diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c
index 51acc6d86e91..3a639180e4a0 100644
--- a/drivers/net/fddi/skfp/skfddi.c
+++ b/drivers/net/fddi/skfp/skfddi.c
@@ -166,7 +166,6 @@ static const struct net_device_ops skfp_netdev_ops = {
 	.ndo_stop		= skfp_close,
 	.ndo_start_xmit		= skfp_send_pkt,
 	.ndo_get_stats		= skfp_ctl_get_stats,
-	.ndo_change_mtu		= fddi_change_mtu,
 	.ndo_set_rx_mode	= skfp_ctl_set_multicast_list,
 	.ndo_set_mac_address	= skfp_ctl_set_mac_address,
 	.ndo_do_ioctl		= skfp_ioctl,
diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c
index f36eb4ad40b7..b77e4ecf3cf2 100644
--- a/drivers/net/fjes/fjes_main.c
+++ b/drivers/net/fjes/fjes_main.c
@@ -1316,6 +1316,8 @@ static void fjes_netdev_setup(struct net_device *netdev)
 	netdev->netdev_ops = &fjes_netdev_ops;
 	fjes_set_ethtool_ops(netdev);
 	netdev->mtu = fjes_support_mtu[3];
+	netdev->min_mtu = fjes_support_mtu[0];
+	netdev->max_mtu = fjes_support_mtu[3];
 	netdev->flags |= IFF_BROADCAST;
 	netdev->features |= NETIF_F_HW_CSUM | NETIF_F_HW_VLAN_CTAG_FILTER;
 }
diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c
index 95c0b45a68fb..f5a9728b89f3 100644
--- a/drivers/net/hippi/rrunner.c
+++ b/drivers/net/hippi/rrunner.c
@@ -68,7 +68,6 @@ static const struct net_device_ops rr_netdev_ops = {
 	.ndo_stop		= rr_close,
 	.ndo_do_ioctl		= rr_ioctl,
 	.ndo_start_xmit		= rr_start_xmit,
-	.ndo_change_mtu		= hippi_change_mtu,
 	.ndo_set_mac_address	= hippi_mac_addr,
 };
 
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index a31f4610b493..300bb1479b3a 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -466,17 +466,6 @@ static void rionet_set_msglevel(struct net_device *ndev, u32 value)
 	rnet->msg_enable = value;
 }
 
-static int rionet_change_mtu(struct net_device *ndev, int new_mtu)
-{
-	if ((new_mtu < 68) || (new_mtu > RIONET_MAX_MTU)) {
-		printk(KERN_ERR "%s: Invalid MTU size %d\n",
-		       ndev->name, new_mtu);
-		return -EINVAL;
-	}
-	ndev->mtu = new_mtu;
-	return 0;
-}
-
 static const struct ethtool_ops rionet_ethtool_ops = {
 	.get_drvinfo = rionet_get_drvinfo,
 	.get_msglevel = rionet_get_msglevel,
@@ -488,7 +477,6 @@ static const struct net_device_ops rionet_netdev_ops = {
 	.ndo_open		= rionet_open,
 	.ndo_stop		= rionet_close,
 	.ndo_start_xmit		= rionet_start_xmit,
-	.ndo_change_mtu		= rionet_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 };
@@ -525,6 +513,9 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev)
 
 	ndev->netdev_ops = &rionet_netdev_ops;
 	ndev->mtu = RIONET_MAX_MTU;
+	/* MTU range: 68 - 4082 */
+	ndev->min_mtu = ETH_MIN_MTU;
+	ndev->max_mtu = RIONET_MAX_MTU;
 	ndev->features = NETIF_F_LLTX;
 	SET_NETDEV_DEV(ndev, &mport->dev);
 	ndev->ethtool_ops = &rionet_ethtool_ops;
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 9ed6d1c1ee45..7e933d8ff811 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -561,12 +561,7 @@ static int sl_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct slip *sl = netdev_priv(dev);
 
-	if (new_mtu < 68 || new_mtu > 65534)
-		return -EINVAL;
-
-	if (new_mtu != dev->mtu)
-		return sl_realloc_bufs(sl, new_mtu);
-	return 0;
+	return sl_realloc_bufs(sl, new_mtu);
 }
 
 /* Netdevice get statistics request */
@@ -663,6 +658,10 @@ static void sl_setup(struct net_device *dev)
 	dev->addr_len		= 0;
 	dev->tx_queue_len	= 10;
 
+	/* MTU range: 68 - 65534 */
+	dev->min_mtu = 68;
+	dev->max_mtu = 65534;
+
 	/* New-style flags. */
 	dev->flags		= IFF_NOARP|IFF_POINTOPOINT|IFF_MULTICAST;
 }
diff --git a/drivers/usb/gadget/function/f_phonet.c b/drivers/usb/gadget/function/f_phonet.c
index 0473d619d5bf..b4058f0000e4 100644
--- a/drivers/usb/gadget/function/f_phonet.c
+++ b/drivers/usb/gadget/function/f_phonet.c
@@ -261,19 +261,10 @@ out:
 	return NETDEV_TX_OK;
 }
 
-static int pn_net_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < PHONET_MIN_MTU) || (new_mtu > PHONET_MAX_MTU))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 static const struct net_device_ops pn_netdev_ops = {
 	.ndo_open	= pn_net_open,
 	.ndo_stop	= pn_net_close,
 	.ndo_start_xmit	= pn_net_xmit,
-	.ndo_change_mtu	= pn_net_mtu,
 };
 
 static void pn_net_setup(struct net_device *dev)
@@ -282,6 +273,8 @@ static void pn_net_setup(struct net_device *dev)
 	dev->type		= ARPHRD_PHONET;
 	dev->flags		= IFF_POINTOPOINT | IFF_NOARP;
 	dev->mtu		= PHONET_DEV_MTU;
+	dev->min_mtu		= PHONET_MIN_MTU;
+	dev->max_mtu		= PHONET_MAX_MTU;
 	dev->hard_header_len	= 1;
 	dev->dev_addr[0]	= PN_MEDIA_USB;
 	dev->addr_len		= 1;
diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c
index 9c8c9ed1dc9e..39a6df1e2ded 100644
--- a/drivers/usb/gadget/function/u_ether.c
+++ b/drivers/usb/gadget/function/u_ether.c
@@ -142,15 +142,6 @@ static inline int qlen(struct usb_gadget *gadget, unsigned qmult)
 
 /* NETWORK DRIVER HOOKUP (to the layer above this driver) */
 
-static int ueth_change_mtu(struct net_device *net, int new_mtu)
-{
-	if (new_mtu <= ETH_HLEN || new_mtu > GETHER_MAX_ETH_FRAME_LEN)
-		return -ERANGE;
-	net->mtu = new_mtu;
-
-	return 0;
-}
-
 static void eth_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *p)
 {
 	struct eth_dev *dev = netdev_priv(net);
@@ -736,7 +727,6 @@ static const struct net_device_ops eth_netdev_ops = {
 	.ndo_open		= eth_open,
 	.ndo_stop		= eth_stop,
 	.ndo_start_xmit		= eth_start_xmit,
-	.ndo_change_mtu		= ueth_change_mtu,
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 };
@@ -799,6 +789,10 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g,
 
 	net->ethtool_ops = &ops;
 
+	/* MTU range: 14 - 15412 */
+	net->min_mtu = ETH_HLEN;
+	net->max_mtu = GETHER_MAX_ETH_FRAME_LEN;
+
 	dev->gadget = g;
 	SET_NETDEV_DEV(net, &g->dev);
 	SET_NETDEV_DEVTYPE(net, &gadget_type);
diff --git a/include/linux/fddidevice.h b/include/linux/fddidevice.h
index 9a79f0106da1..32c22cfb238b 100644
--- a/include/linux/fddidevice.h
+++ b/include/linux/fddidevice.h
@@ -26,7 +26,6 @@
 
 #ifdef __KERNEL__
 __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev);
-int fddi_change_mtu(struct net_device *dev, int new_mtu);
 struct net_device *alloc_fddidev(int sizeof_priv);
 #endif
 
diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h
index 8ec23fb0b412..402f99e328d4 100644
--- a/include/linux/hippidevice.h
+++ b/include/linux/hippidevice.h
@@ -32,7 +32,6 @@ struct hippi_cb {
 };
 
 __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev);
-int hippi_change_mtu(struct net_device *dev, int new_mtu);
 int hippi_mac_addr(struct net_device *dev, void *p);
 int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p);
 struct net_device *alloc_hippi_dev(int sizeof_priv);
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 7d3a0af954e8..6356623fc238 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -141,15 +141,6 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(fddi_type_trans);
 
-int fddi_change_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-EXPORT_SYMBOL(fddi_change_mtu);
-
 static const struct header_ops fddi_header_ops = {
 	.create		= fddi_header,
 };
@@ -161,6 +152,8 @@ static void fddi_setup(struct net_device *dev)
 	dev->type		= ARPHRD_FDDI;
 	dev->hard_header_len	= FDDI_K_SNAP_HLEN+3;	/* Assume 802.2 SNAP hdr len + 3 pad bytes */
 	dev->mtu		= FDDI_K_SNAP_DLEN;	/* Assume max payload of 802.2 SNAP frame */
+	dev->min_mtu		= FDDI_K_SNAP_HLEN;
+	dev->max_mtu		= FDDI_K_SNAP_DLEN;
 	dev->addr_len		= FDDI_K_ALEN;
 	dev->tx_queue_len	= 100;			/* Long queues on FDDI */
 	dev->flags		= IFF_BROADCAST | IFF_MULTICAST;
diff --git a/net/802/hippi.c b/net/802/hippi.c
index ade1a52cdcff..5e4427beab2b 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -116,18 +116,6 @@ __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 EXPORT_SYMBOL(hippi_type_trans);
 
-int hippi_change_mtu(struct net_device *dev, int new_mtu)
-{
-	/*
-	 * HIPPI's got these nice large MTUs.
-	 */
-	if ((new_mtu < 68) || (new_mtu > 65280))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-EXPORT_SYMBOL(hippi_change_mtu);
-
 /*
  * For HIPPI we will actually use the lower 4 bytes of the hardware
  * address as the I-FIELD rather than the actual hardware address.
@@ -174,6 +162,8 @@ static void hippi_setup(struct net_device *dev)
 	dev->type		= ARPHRD_HIPPI;
 	dev->hard_header_len 	= HIPPI_HLEN;
 	dev->mtu		= 65280;
+	dev->min_mtu		= 68;
+	dev->max_mtu		= 65280;
 	dev->addr_len		= HIPPI_ALEN;
 	dev->tx_queue_len	= 25 /* 5 */;
 	memset(dev->broadcast, 0xFF, HIPPI_ALEN);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 49e16b6e0ba3..112679d64be5 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -158,17 +158,6 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
 	return 0;
 }
 
-static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
-{
-	/* check ranges */
-	if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev)))
-		return -EINVAL;
-
-	dev->mtu = new_mtu;
-
-	return 0;
-}
-
 /**
  * batadv_interface_set_rx_mode - set the rx mode of a device
  * @dev: registered network device to modify
@@ -920,7 +909,6 @@ static const struct net_device_ops batadv_netdev_ops = {
 	.ndo_vlan_rx_add_vid = batadv_interface_add_vid,
 	.ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
 	.ndo_set_mac_address = batadv_interface_set_mac_addr,
-	.ndo_change_mtu = batadv_interface_change_mtu,
 	.ndo_set_rx_mode = batadv_interface_set_rx_mode,
 	.ndo_start_xmit = batadv_interface_tx,
 	.ndo_validate_addr = eth_validate_addr,
@@ -987,6 +975,7 @@ struct net_device *batadv_softif_create(struct net *net, const char *name)
 	dev_net_set(soft_iface, net);
 
 	soft_iface->rtnl_link_ops = &batadv_link_ops;
+	soft_iface->max_mtu = batadv_hardif_min_mtu(soft_iface);
 
 	ret = register_netdevice(soft_iface);
 	if (ret < 0) {
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 16737cd8dae8..fc65b145f6e7 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -398,6 +398,7 @@ void hsr_dev_setup(struct net_device *dev)
 	random_ether_addr(dev->dev_addr);
 
 	ether_setup(dev);
+	dev->min_mtu = 0;
 	dev->header_ops = &hsr_header_ops;
 	dev->netdev_ops = &hsr_device_ops;
 	SET_NETDEV_DEVTYPE(dev, &hsr_type);
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index fa8237fdc57b..21c28b51be94 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -217,20 +217,10 @@ static netdev_tx_t gprs_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
-static int gprs_set_mtu(struct net_device *dev, int new_mtu)
-{
-	if ((new_mtu < 576) || (new_mtu > (PHONET_MAX_MTU - 11)))
-		return -EINVAL;
-
-	dev->mtu = new_mtu;
-	return 0;
-}
-
 static const struct net_device_ops gprs_netdev_ops = {
 	.ndo_open	= gprs_open,
 	.ndo_stop	= gprs_close,
 	.ndo_start_xmit	= gprs_xmit,
-	.ndo_change_mtu	= gprs_set_mtu,
 };
 
 static void gprs_setup(struct net_device *dev)
@@ -239,6 +229,8 @@ static void gprs_setup(struct net_device *dev)
 	dev->type		= ARPHRD_PHONET_PIPE;
 	dev->flags		= IFF_POINTOPOINT | IFF_NOARP;
 	dev->mtu		= GPRS_DEFAULT_MTU;
+	dev->min_mtu		= 576;
+	dev->max_mtu		= (PHONET_MAX_MTU - 11);
 	dev->hard_header_len	= 0;
 	dev->addr_len		= 0;
 	dev->tx_queue_len	= 10;
-- 
cgit v1.2.3


From 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Oct 2016 12:46:33 +0200
Subject: bpf: add helper for retrieving current numa node id

The main use case is for soreuseport to select sockets for the local
NUMA node, but since the helper is generic, let's also add it for other
networking and tracing program types.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/core.c        |  1 +
 kernel/bpf/helpers.c     | 12 ++++++++++++
 kernel/trace/bpf_trace.c |  2 ++
 net/core/filter.c        |  2 ++
 6 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c201017b5730..edcd96ded8aa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,6 +319,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
 extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b97eca..374ef582ae18 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -426,6 +426,12 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_set_hash_invalid,
 
+	/**
+	 * bpf_get_numa_node_id()
+	 * Returns the id of the current NUMA node.
+	 */
+	BPF_FUNC_get_numa_node_id,
+
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index aa6d98154106..82a04143368e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1043,6 +1043,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 39918402e6e9..045cbe673356 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -13,6 +13,7 @@
 #include <linux/rcupdate.h>
 #include <linux/random.h>
 #include <linux/smp.h>
+#include <linux/topology.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
 #include <linux/uidgid.h>
@@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
 	.ret_type	= RET_INTEGER,
 };
 
+BPF_CALL_0(bpf_get_numa_node_id)
+{
+	return numa_node_id();
+}
+
+const struct bpf_func_proto bpf_get_numa_node_id_proto = {
+	.func		= bpf_get_numa_node_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
 BPF_CALL_0(bpf_ktime_get_ns)
 {
 	/* NMI safe access to clock monotonic */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5dcb99281259..fa77311dadb2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -422,6 +422,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return bpf_get_trace_printk_proto();
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_get_numa_node_id:
+		return &bpf_get_numa_node_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_probe_write_user:
diff --git a/net/core/filter.c b/net/core/filter.c
index 00351cdf7d0c..cd9e2ba66b0e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2492,6 +2492,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_raw_smp_processor_id_proto;
+	case BPF_FUNC_get_numa_node_id:
+		return &bpf_get_numa_node_id_proto;
 	case BPF_FUNC_tail_call:
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
-- 
cgit v1.2.3


From 3cf25904fe467aebeaa77d402b6cf3c6c5d6303b Mon Sep 17 00:00:00 2001
From: Xo Wang <xow@google.com>
Date: Fri, 21 Oct 2016 10:20:12 -0700
Subject: net: phy: broadcom: Update Auxiliary Control Register macros

Add the RXD-to-RXC skew (delay) time bit in the Miscellaneous Control
shadow register and a mask for the shadow selector field.

Remove a re-definition of MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL.

Signed-off-by: Xo Wang <xow@google.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index e3354b74286c..22c4421c916c 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -105,11 +105,12 @@
 #define MII_BCM54XX_AUXCTL_ACTL_SMDSP_ENA	0x0800
 
 #define MII_BCM54XX_AUXCTL_MISC_WREN	0x8000
+#define MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 
-#define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL	0x0000
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
 /*
  * Broadcom LED source encodings.  These are used in BCM5461, BCM5481,
-- 
cgit v1.2.3


From d92ead16be405b6d52ff7b366d1c9865ccc684bd Mon Sep 17 00:00:00 2001
From: Xo Wang <xow@google.com>
Date: Fri, 21 Oct 2016 10:20:13 -0700
Subject: net: phy: broadcom: Add support for BCM54612E

This PHY has internal delays enabled after reset. Clear the internal
delay enable bits unless the interface mode specifically requests them.

Signed-off-by: Xo Wang <xow@google.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/brcmphy.h    |  1 +
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 870327efccf7..583ef8a2ec8d 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -337,6 +337,41 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
 	return ret;
 }
 
+static int bcm54612e_config_aneg(struct phy_device *phydev)
+{
+	int ret;
+
+	/* First, auto-negotiate. */
+	ret = genphy_config_aneg(phydev);
+
+	/* Clear TX internal delay unless requested. */
+	if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
+	    (phydev->interface != PHY_INTERFACE_MODE_RGMII_TXID)) {
+		/* Disable TXD to GTXCLK clock delay (default set) */
+		/* Bit 9 is the only field in shadow register 00011 */
+		bcm_phy_write_shadow(phydev, 0x03, 0);
+	}
+
+	/* Clear RX internal delay unless requested. */
+	if ((phydev->interface != PHY_INTERFACE_MODE_RGMII_ID) &&
+	    (phydev->interface != PHY_INTERFACE_MODE_RGMII_RXID)) {
+		u16 reg;
+
+		/* Errata: reads require filling in the write selector field */
+		bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+				     MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC);
+		reg = phy_read(phydev, MII_BCM54XX_AUX_CTL);
+		/* Disable RXD to RXC delay (default set) */
+		reg &= ~MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW;
+		/* Clear shadow selector field */
+		reg &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MASK;
+		bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+				     MII_BCM54XX_AUXCTL_MISC_WREN | reg);
+	}
+
+	return ret;
+}
+
 static int brcm_phy_setbits(struct phy_device *phydev, int reg, int set)
 {
 	int val;
@@ -484,6 +519,18 @@ static struct phy_driver broadcom_drivers[] = {
 	.read_status	= genphy_read_status,
 	.ack_interrupt	= bcm_phy_ack_intr,
 	.config_intr	= bcm_phy_config_intr,
+}, {
+	.phy_id		= PHY_ID_BCM54612E,
+	.phy_id_mask	= 0xfffffff0,
+	.name		= "Broadcom BCM54612E",
+	.features	= PHY_GBIT_FEATURES |
+			  SUPPORTED_Pause | SUPPORTED_Asym_Pause,
+	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init	= bcm54xx_config_init,
+	.config_aneg	= bcm54612e_config_aneg,
+	.read_status	= genphy_read_status,
+	.ack_interrupt	= bcm_phy_ack_intr,
+	.config_intr	= bcm_phy_config_intr,
 }, {
 	.phy_id		= PHY_ID_BCM54616S,
 	.phy_id_mask	= 0xfffffff0,
@@ -600,6 +647,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM5411, 0xfffffff0 },
 	{ PHY_ID_BCM5421, 0xfffffff0 },
 	{ PHY_ID_BCM5461, 0xfffffff0 },
+	{ PHY_ID_BCM54612E, 0xfffffff0 },
 	{ PHY_ID_BCM54616S, 0xfffffff0 },
 	{ PHY_ID_BCM5464, 0xfffffff0 },
 	{ PHY_ID_BCM5481, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 22c4421c916c..60def78c4e12 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -18,6 +18,7 @@
 #define PHY_ID_BCM5421			0x002060e0
 #define PHY_ID_BCM5464			0x002060b0
 #define PHY_ID_BCM5461			0x002060c0
+#define PHY_ID_BCM54612E		0x03625e60
 #define PHY_ID_BCM54616S		0x03625d10
 #define PHY_ID_BCM57780			0x03625d90
 
-- 
cgit v1.2.3


From 3f817fe718c6cb3ddcc2ab04ba86faecc20ef8fe Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:01 +0300
Subject: cfg80211: Define IEEE P802.11ai (FILS) information elements

Define the Element IDs and Element ID Extensions from IEEE
P802.11ai/D11.0. In addition, add a new cfg80211_find_ext_ie() wrapper
to make it easier to find information elements that use the Element ID
Extension field.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 20 ++++++++++++++++++++
 include/net/cfg80211.h    | 21 +++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a80516fd65c8..d428adf51446 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1960,6 +1960,26 @@ enum ieee80211_eid {
 
 	WLAN_EID_VENDOR_SPECIFIC = 221,
 	WLAN_EID_QOS_PARAMETER = 222,
+	WLAN_EID_CAG_NUMBER = 237,
+	WLAN_EID_AP_CSN = 239,
+	WLAN_EID_FILS_INDICATION = 240,
+	WLAN_EID_DILS = 241,
+	WLAN_EID_FRAGMENT = 242,
+	WLAN_EID_EXTENSION = 255
+};
+
+/* Element ID Extensions for Element ID 255 */
+enum ieee80211_eid_ext {
+	WLAN_EID_EXT_ASSOC_DELAY_INFO = 1,
+	WLAN_EID_EXT_FILS_REQ_PARAMS = 2,
+	WLAN_EID_EXT_FILS_KEY_CONFIRM = 3,
+	WLAN_EID_EXT_FILS_SESSION = 4,
+	WLAN_EID_EXT_FILS_HLP_CONTAINER = 5,
+	WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN = 6,
+	WLAN_EID_EXT_KEY_DELIVERY = 7,
+	WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
+	WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
+	WLAN_EID_EXT_FILS_NONCE = 13,
 };
 
 /* Action category code */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index dffc265a4fd6..8ca2e9f354f7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4180,6 +4180,27 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
 	return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0);
 }
 
+/**
+ * cfg80211_find_ext_ie - find information element with EID Extension in data
+ *
+ * @ext_eid: element ID Extension
+ * @ies: data consisting of IEs
+ * @len: length of data
+ *
+ * Return: %NULL if the extended element ID could not be found or if
+ * the element is invalid (claims to be longer than the given
+ * data), or a pointer to the first byte of the requested
+ * element, that is the byte containing the element ID.
+ *
+ * Note: There are no checks on the element length other than
+ * having to fit into the given data.
+ */
+static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
+{
+	return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
+				      &ext_eid, 1, 2);
+}
+
 /**
  * cfg80211_find_vendor_ie - find vendor specific information element in data
  *
-- 
cgit v1.2.3


From 631810603a20874554b2f17adf42b72d0f15eda5 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:02 +0300
Subject: cfg80211: Add Fast Initial Link Setup (FILS) auth algs

This defines authentication algorithms for FILS (IEEE 802.11ai).

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  3 +++
 include/uapi/linux/nl80211.h |  6 ++++++
 net/wireless/nl80211.c       | 21 +++++++++++++++++++--
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d428adf51446..793a0174ba29 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1576,6 +1576,9 @@ struct ieee80211_vht_operation {
 #define WLAN_AUTH_SHARED_KEY 1
 #define WLAN_AUTH_FT 2
 #define WLAN_AUTH_SAE 3
+#define WLAN_AUTH_FILS_SK 4
+#define WLAN_AUTH_FILS_SK_PFS 5
+#define WLAN_AUTH_FILS_PK 6
 #define WLAN_AUTH_LEAP 128
 
 #define WLAN_AUTH_CHALLENGE_LEN 128
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7825fd4db19e..4dc21265cd12 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3669,6 +3669,9 @@ enum nl80211_bss_status {
  * @NL80211_AUTHTYPE_FT: Fast BSS Transition (IEEE 802.11r)
  * @NL80211_AUTHTYPE_NETWORK_EAP: Network EAP (some Cisco APs and mainly LEAP)
  * @NL80211_AUTHTYPE_SAE: Simultaneous authentication of equals
+ * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key
+ * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS
+ * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key
  * @__NL80211_AUTHTYPE_NUM: internal
  * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm
  * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by
@@ -3681,6 +3684,9 @@ enum nl80211_auth_type {
 	NL80211_AUTHTYPE_FT,
 	NL80211_AUTHTYPE_NETWORK_EAP,
 	NL80211_AUTHTYPE_SAE,
+	NL80211_AUTHTYPE_FILS_SK,
+	NL80211_AUTHTYPE_FILS_SK_PFS,
+	NL80211_AUTHTYPE_FILS_PK,
 
 	/* keep last */
 	__NL80211_AUTHTYPE_NUM,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 704851142eed..ff798620e929 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3778,12 +3778,23 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 		if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
 		    auth_type == NL80211_AUTHTYPE_SAE)
 			return false;
+		if (!wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_FILS_STA) &&
+		    (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+		     auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+		     auth_type == NL80211_AUTHTYPE_FILS_PK))
+			return false;
 		return true;
 	case NL80211_CMD_CONNECT:
 	case NL80211_CMD_START_AP:
 		/* SAE not supported yet */
 		if (auth_type == NL80211_AUTHTYPE_SAE)
 			return false;
+		/* FILS not supported yet */
+		if (auth_type == NL80211_AUTHTYPE_FILS_SK ||
+		    auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+		    auth_type == NL80211_AUTHTYPE_FILS_PK)
+			return false;
 		return true;
 	default:
 		return false;
@@ -7810,12 +7821,18 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
 		return -EINVAL;
 
-	if (auth_type == NL80211_AUTHTYPE_SAE &&
+	if ((auth_type == NL80211_AUTHTYPE_SAE ||
+	     auth_type == NL80211_AUTHTYPE_FILS_SK ||
+	     auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
+	     auth_type == NL80211_AUTHTYPE_FILS_PK) &&
 	    !info->attrs[NL80211_ATTR_AUTH_DATA])
 		return -EINVAL;
 
 	if (info->attrs[NL80211_ATTR_AUTH_DATA]) {
-		if (auth_type != NL80211_AUTHTYPE_SAE)
+		if (auth_type != NL80211_AUTHTYPE_SAE &&
+		    auth_type != NL80211_AUTHTYPE_FILS_SK &&
+		    auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
+		    auth_type != NL80211_AUTHTYPE_FILS_PK)
 			return -EINVAL;
 		auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
 		auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
-- 
cgit v1.2.3


From 348bd456699801920a309c66e382380809fbdf41 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:03 +0300
Subject: cfg80211: Add KEK/nonces for FILS association frames

The new nl80211 attributes can be used to provide KEK and nonces to
allow the driver to encrypt and decrypt FILS (Re)Association
Request/Response frames in station mode.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    |  3 +++
 include/net/cfg80211.h       |  9 +++++++++
 include/uapi/linux/nl80211.h |  8 ++++++++
 net/wireless/nl80211.c       | 12 ++++++++++++
 4 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 793a0174ba29..fe849329511a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2096,6 +2096,9 @@ enum ieee80211_key_len {
 #define IEEE80211_GCMP_MIC_LEN		16
 #define IEEE80211_GCMP_PN_LEN		6
 
+#define FILS_NONCE_LEN			16
+#define FILS_MAX_KEK_LEN		64
+
 /* Public action codes */
 enum ieee80211_pub_actioncode {
 	WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8ca2e9f354f7..738b4d8a4666 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1840,6 +1840,12 @@ enum cfg80211_assoc_req_flags {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa: VHT capability override
  * @vht_capa_mask: VHT capability mask indicating which fields to use
+ * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
+ *	%NULL if FILS is not used.
+ * @fils_kek_len: Length of fils_kek in octets
+ * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
+ *	Request/Response frame or %NULL if FILS is not used. This field starts
+ *	with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
  */
 struct cfg80211_assoc_request {
 	struct cfg80211_bss *bss;
@@ -1851,6 +1857,9 @@ struct cfg80211_assoc_request {
 	struct ieee80211_ht_cap ht_capa;
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa, vht_capa_mask;
+	const u8 *fils_kek;
+	size_t fils_kek_len;
+	const u8 *fils_nonces;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4dc21265cd12..a268a009528a 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1944,6 +1944,11 @@ enum nl80211_commands {
  *	attribute.
  * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute.
  *	See &enum nl80211_nan_match_attributes.
+ * @NL80211_ATTR_FILS_KEK: KEK for FILS (Re)Association Request/Response frame
+ *	protection.
+ * @NL80211_ATTR_FILS_NONCES: Nonces (part of AAD) for FILS (Re)Association
+ *	Request/Response frame protection. This attribute contains the 16 octet
+ *	STA Nonce followed by 16 octets of AP Nonce.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2344,6 +2349,9 @@ enum nl80211_attrs {
 	NL80211_ATTR_NAN_FUNC,
 	NL80211_ATTR_NAN_MATCH,
 
+	NL80211_ATTR_FILS_KEK,
+	NL80211_ATTR_FILS_NONCES,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ff798620e929..667d5f719c22 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -414,6 +414,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 },
 	[NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 },
 	[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
+	[NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
+				    .len = FILS_MAX_KEK_LEN },
+	[NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN },
 };
 
 /* policy for the key attributes */
@@ -8033,6 +8036,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 		req.flags |= ASSOC_REQ_USE_RRM;
 	}
 
+	if (info->attrs[NL80211_ATTR_FILS_KEK]) {
+		req.fils_kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]);
+		req.fils_kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]);
+		if (!info->attrs[NL80211_ATTR_FILS_NONCES])
+			return -EINVAL;
+		req.fils_nonces =
+			nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]);
+	}
+
 	err = nl80211_crypto_settings(rdev, info, &req.crypto, 1);
 	if (!err) {
 		wdev_lock(dev->ieee80211_ptr);
-- 
cgit v1.2.3


From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:02 +0200
Subject: genetlink: no longer support using static family IDs

Static family IDs have never really been used; the only
use case was the workaround I introduced for those users
that assumed their family ID was also their multicast
group ID.

Additionally, because static family IDs would never be
reserved by the generic netlink code, using a relatively
low ID would only work for built-in families that can be
registered immediately after generic netlink is started,
which is basically only the control family (apart from
the workaround code, for which I also had to add code so
that it would reserve those IDs).

Thus, anything other than GENL_ID_GENERATE is flawed and
luckily not used except in the cases I mentioned. Move
those workarounds into a few lines of code, and then get
rid of GENL_ID_GENERATE entirely, making it more robust.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c                  |  1 -
 drivers/net/gtp.c                     |  1 -
 drivers/net/macsec.c                  |  1 -
 drivers/net/team/team.c               |  1 -
 drivers/net/wireless/mac80211_hwsim.c |  1 -
 drivers/scsi/pmcraid.c                |  6 ------
 drivers/target/target_core_user.c     |  1 -
 drivers/thermal/thermal_core.c        |  1 -
 fs/dlm/netlink.c                      |  1 -
 fs/quota/netlink.c                    |  7 -------
 include/linux/genl_magic_func.h       |  1 -
 include/net/genetlink.h               |  7 ++-----
 include/uapi/linux/genetlink.h        |  1 -
 kernel/taskstats.c                    |  1 -
 net/batman-adv/netlink.c              |  1 -
 net/core/devlink.c                    |  1 -
 net/core/drop_monitor.c               |  1 -
 net/hsr/hsr_netlink.c                 |  1 -
 net/ieee802154/netlink.c              |  1 -
 net/ieee802154/nl802154.c             |  1 -
 net/ipv4/fou.c                        |  1 -
 net/ipv4/tcp_metrics.c                |  1 -
 net/ipv6/ila/ila_xlat.c               |  1 -
 net/irda/irnetlink.c                  |  1 -
 net/l2tp/l2tp_netlink.c               |  1 -
 net/netfilter/ipvs/ip_vs_ctl.c        |  1 -
 net/netlabel/netlabel_calipso.c       |  1 -
 net/netlabel/netlabel_cipso_v4.c      |  1 -
 net/netlabel/netlabel_mgmt.c          |  1 -
 net/netlabel/netlabel_unlabeled.c     |  1 -
 net/netlink/genetlink.c               | 37 +++++++++++++++++++++--------------
 net/nfc/netlink.c                     |  1 -
 net/openvswitch/datapath.c            |  4 ----
 net/tipc/netlink.c                    |  1 -
 net/tipc/netlink_compat.c             |  1 -
 net/wimax/stack.c                     |  1 -
 net/wireless/nl80211.c                |  1 -
 37 files changed, 24 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index e24ea4e796e4..8dfca3d53131 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -83,7 +83,6 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = {
 };
 
 static struct genl_family acpi_event_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.name = ACPI_GENL_FAMILY_NAME,
 	.version = ACPI_GENL_VERSION,
 	.maxattr = ACPI_GENL_ATTR_MAX,
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 97e0cbca0a08..f66737ba1299 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -1095,7 +1095,6 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info)
 }
 
 static struct genl_family gtp_genl_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= "gtp",
 	.version	= 0,
 	.hdrsize	= 0,
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 1a134cb2d52c..a5309b81a786 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -1422,7 +1422,6 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa)
 }
 
 static struct genl_family macsec_fam = {
-	.id		= GENL_ID_GENERATE,
 	.name		= MACSEC_GENL_NAME,
 	.hdrsize	= 0,
 	.version	= MACSEC_GENL_VERSION,
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index a380649bf6b5..0b50205764ff 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2151,7 +2151,6 @@ static struct rtnl_link_ops team_link_ops __read_mostly = {
  ***********************************/
 
 static struct genl_family team_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= TEAM_GENL_NAME,
 	.version	= TEAM_GENL_VERSION,
 	.maxattr	= TEAM_ATTR_MAX,
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index e95b79bccf9b..54b6cd62676e 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -589,7 +589,6 @@ struct hwsim_radiotap_ack_hdr {
 
 /* MAC80211_HWSIM netlinf family */
 static struct genl_family hwsim_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = "MAC80211_HWSIM",
 	.version = 1,
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 68a5c347fae9..cc50eb87b28a 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1369,12 +1369,6 @@ static struct genl_multicast_group pmcraid_mcgrps[] = {
 };
 
 static struct genl_family pmcraid_event_family = {
-	/*
-	 * Due to prior multicast group abuse (the code having assumed that
-	 * the family ID can be used as a multicast group ID) we need to
-	 * statically allocate a family (and thus group) ID.
-	 */
-	.id = GENL_ID_PMCRAID,
 	.name = "pmcraid",
 	.version = 1,
 	.maxattr = PMCRAID_AEN_ATTR_MAX,
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 62bf4fe5704a..313a0ef3cda7 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -148,7 +148,6 @@ static const struct genl_multicast_group tcmu_mcgrps[] = {
 
 /* Our generic netlink family */
 static struct genl_family tcmu_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = "TCM-USER",
 	.version = 1,
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 226b0b4aced6..68d7503f6417 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -2164,7 +2164,6 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = {
 };
 
 static struct genl_family thermal_event_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.name = THERMAL_GENL_FAMILY_NAME,
 	.version = THERMAL_GENL_VERSION,
 	.maxattr = THERMAL_GENL_ATTR_MAX,
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 1e6e227134d7..00d226956264 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -17,7 +17,6 @@ static uint32_t dlm_nl_seqnum;
 static uint32_t listener_nlportid;
 
 static struct genl_family family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= DLM_GENL_NAME,
 	.version	= DLM_GENL_VERSION,
 };
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 8b252673d454..3965a5cdfaa2 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -13,13 +13,6 @@ static const struct genl_multicast_group quota_mcgrps[] = {
 
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
-	/*
-	 * Needed due to multicast group ID abuse - old code assumed
-	 * the family ID was also a valid multicast group ID (which
-	 * isn't true) and userspace might thus rely on it. Assign a
-	 * static ID for this group to make dealing with that easier.
-	 */
-	.id = GENL_ID_VFS_DQUOT,
 	.hdrsize = 0,
 	.name = "VFS_DQUOT",
 	.version = 1,
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 667c31101b8b..7c070c1fe457 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -260,7 +260,6 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
  */
 #define ZZZ_genl_family		CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
 static struct genl_family ZZZ_genl_family __read_mostly = {
-	.id = GENL_ID_GENERATE,
 	.name = __stringify(GENL_MAGIC_FAMILY),
 	.version = GENL_MAGIC_VERSION,
 #ifdef GENL_MAGIC_FAMILY_HDRSZ
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index ef9defb3f5bc..43a5c3975a2f 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -20,7 +20,7 @@ struct genl_info;
 
 /**
  * struct genl_family - generic netlink family
- * @id: protocol family idenfitier
+ * @id: protocol family identifier (private)
  * @hdrsize: length of user specific header in bytes
  * @name: name of family
  * @version: protocol version
@@ -48,7 +48,7 @@ struct genl_info;
  * @n_ops: number of operations supported by this family (private)
  */
 struct genl_family {
-	unsigned int		id;
+	unsigned int		id;		/* private */
 	unsigned int		hdrsize;
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
@@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family)
  * Registers the specified family and operations from the specified table.
  * Only one family may be registered with the same family name or identifier.
  *
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
- *
  * Either a doit or dumpit callback must be specified for every registered
  * operation or the function will fail. Only one operation structure per
  * command identifier may be registered.
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index 5512c90af7e3..d9b2db4a29c6 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -26,7 +26,6 @@ struct genlmsghdr {
 /*
  * List of reserved static generic netlink identifiers:
  */
-#define GENL_ID_GENERATE	0
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
 #define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
 #define GENL_ID_PMCRAID		(NLMSG_MIN_TYPE + 2)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee20d18..d7a1a9461a10 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -42,7 +42,6 @@ static int family_registered;
 struct kmem_cache *taskstats_cache;
 
 static struct genl_family family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= TASKSTATS_GENL_NAME,
 	.version	= TASKSTATS_GENL_VERSION,
 	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 64cb6acbe0a6..a03b0ed7e8dd 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -49,7 +49,6 @@
 #include "translation-table.h"
 
 struct genl_family batadv_netlink_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = BATADV_NL_NAME,
 	.version = 1,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index d2fd736de6a2..3008d9c33875 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -342,7 +342,6 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
 }
 
 static struct genl_family devlink_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= DEVLINK_GENL_NAME,
 	.version	= DEVLINK_GENL_VERSION,
 	.maxattr	= DEVLINK_ATTR_MAX,
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 72cfb0c61125..a5320dfcd978 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -60,7 +60,6 @@ struct dm_hw_stat_delta {
 };
 
 static struct genl_family net_drop_monitor_family = {
-	.id             = GENL_ID_GENERATE,
 	.hdrsize        = 0,
 	.name           = "NET_DM",
 	.version        = 2,
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index d4d1617f43a8..2ad039492bee 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -132,7 +132,6 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
 };
 
 static struct genl_family hsr_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = "HSR",
 	.version = 1,
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index c8133c07ceee..19144158b696 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -29,7 +29,6 @@ static unsigned int ieee802154_seq_num;
 static DEFINE_SPINLOCK(ieee802154_seq_lock);
 
 struct genl_family nl802154_family = {
-	.id		= GENL_ID_GENERATE,
 	.hdrsize	= 0,
 	.name		= IEEE802154_NL_NAME,
 	.version	= 1,
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 21aabadccd0e..182299858f1d 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -34,7 +34,6 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 
 /* the netlink family */
 static struct genl_family nl802154_fam = {
-	.id = GENL_ID_GENERATE,		/* don't bother with a hardcoded ID */
 	.name = NL802154_GENL_NAME,	/* have users key off the name instead */
 	.hdrsize = 0,			/* no private header */
 	.version = 1,			/* no particular meaning now */
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index cf50f7e2b012..e3fc527c5d37 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -623,7 +623,6 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 }
 
 static struct genl_family fou_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.hdrsize	= 0,
 	.name		= FOU_GENL_NAME,
 	.version	= FOU_GENL_VERSION,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bf1f3b2b29d1..3da305127b32 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -743,7 +743,6 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 }
 
 static struct genl_family tcp_metrics_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.hdrsize	= 0,
 	.name		= TCP_METRICS_GENL_NAME,
 	.version	= TCP_METRICS_GENL_VERSION,
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e604013dd814..0d57e27d1cdd 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -119,7 +119,6 @@ static const struct rhashtable_params rht_params = {
 };
 
 static struct genl_family ila_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.hdrsize	= 0,
 	.name		= ILA_GENL_NAME,
 	.version	= ILA_GENL_VERSION,
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index e15c40e86660..f23b81aa91fe 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -25,7 +25,6 @@
 
 
 static struct genl_family irda_nl_family = {
-	.id = GENL_ID_GENERATE,
 	.name = IRDA_NL_NAME,
 	.hdrsize = 0,
 	.version = IRDA_NL_VERSION,
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index bf3117771822..4fbf1f41ac52 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -32,7 +32,6 @@
 
 
 static struct genl_family l2tp_nl_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= L2TP_GENL_NAME,
 	.version	= L2TP_GENL_VERSION,
 	.hdrsize	= 0,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c3c809b2e712..ceed66cdd03e 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2841,7 +2841,6 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
 
 /* IPVS genetlink family */
 static struct genl_family ip_vs_genl_family = {
-	.id		= GENL_ID_GENERATE,
 	.hdrsize	= 0,
 	.name		= IPVS_GENL_NAME,
 	.version	= IPVS_GENL_VERSION,
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 2ec93c5e77bb..152e503b8c5d 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg {
 
 /* NetLabel Generic NETLINK CALIPSO family */
 static struct genl_family netlbl_calipso_gnl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_CALIPSO_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 7fd1104ba900..755b284e7ad4 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -60,7 +60,6 @@ struct netlbl_domhsh_walk_arg {
 
 /* NetLabel Generic NETLINK CIPSOv4 family */
 static struct genl_family netlbl_cipsov4_gnl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index f85d0e07af2d..3b00f2368fcd 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg {
 
 /* NetLabel Generic NETLINK CIPSOv4 family */
 static struct genl_family netlbl_mgmt_gnl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_MGMT_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 4528cff9138b..c2ea8d1f653a 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -124,7 +124,6 @@ static u8 netlabel_unlabel_acceptflg;
 
 /* NetLabel Generic NETLINK unlabeled family */
 static struct genl_family netlbl_unlabel_gnl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_UNLABELED_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 01291b7a27bb..f19ec969edee 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -349,8 +349,6 @@ static int genl_validate_ops(const struct genl_family *family)
  *
  * Registers the specified family after validating it first. Only one
  * family may be registered with the same family name or identifier.
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
  *
  * The family's ops array must already be assigned, you can use the
  * genl_register_family_with_ops() helper function.
@@ -359,13 +357,7 @@ static int genl_validate_ops(const struct genl_family *family)
  */
 int __genl_register_family(struct genl_family *family)
 {
-	int err = -EINVAL, i;
-
-	if (family->id && family->id < GENL_MIN_ID)
-		goto errout;
-
-	if (family->id > GENL_MAX_ID)
-		goto errout;
+	int err, i;
 
 	err = genl_validate_ops(family);
 	if (err)
@@ -378,8 +370,27 @@ int __genl_register_family(struct genl_family *family)
 		goto errout_locked;
 	}
 
-	if (family->id == GENL_ID_GENERATE) {
-		u16 newid = genl_generate_id();
+	if (family == &genl_ctrl) {
+		family->id = GENL_ID_CTRL;
+	} else {
+		u16 newid;
+
+		/* this should be left zero in the struct */
+		WARN_ON(family->id);
+
+		/*
+		 * Sadly, a few cases need to be special-cased
+		 * due to them having previously abused the API
+		 * and having used their family ID also as their
+		 * multicast group ID, so we use reserved IDs
+		 * for both to be sure we can do that mapping.
+		 */
+		if (strcmp(family->name, "pmcraid") == 0)
+			newid = GENL_ID_PMCRAID;
+		else if (strcmp(family->name, "VFS_DQUOT") == 0)
+			newid = GENL_ID_VFS_DQUOT;
+		else
+			newid = genl_generate_id();
 
 		if (!newid) {
 			err = -ENOMEM;
@@ -387,9 +398,6 @@ int __genl_register_family(struct genl_family *family)
 		}
 
 		family->id = newid;
-	} else if (genl_family_find_byid(family->id)) {
-		err = -EEXIST;
-		goto errout_locked;
 	}
 
 	if (family->maxattr && !family->parallel_ops) {
@@ -419,7 +427,6 @@ int __genl_register_family(struct genl_family *family)
 
 errout_locked:
 	genl_unlock_all();
-errout:
 	return err;
 }
 EXPORT_SYMBOL(__genl_register_family);
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 79786bf62b88..c230403e066c 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -39,7 +39,6 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = {
 };
 
 static struct genl_family nfc_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = 0,
 	.name = NFC_GENL_NAME,
 	.version = NFC_GENL_VERSION,
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 194435aa1165..f9fef7dfba15 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -671,7 +671,6 @@ static const struct genl_ops dp_packet_genl_ops[] = {
 };
 
 static struct genl_family dp_packet_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_PACKET_FAMILY,
 	.version = OVS_PACKET_VERSION,
@@ -1436,7 +1435,6 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 };
 
 static struct genl_family dp_flow_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_FLOW_FAMILY,
 	.version = OVS_FLOW_VERSION,
@@ -1822,7 +1820,6 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 };
 
 static struct genl_family dp_datapath_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_DATAPATH_FAMILY,
 	.version = OVS_DATAPATH_VERSION,
@@ -2244,7 +2241,6 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 };
 
 struct genl_family dp_vport_genl_family = {
-	.id = GENL_ID_GENERATE,
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_VPORT_FAMILY,
 	.version = OVS_VPORT_VERSION,
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 4b94f3cfe3af..383b8fedabc7 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -136,7 +136,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
  * so we have a separate genl handling for the new API.
  */
 struct genl_family tipc_genl_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= TIPC_GENL_V2_NAME,
 	.version	= TIPC_GENL_V2_VERSION,
 	.hdrsize	= 0,
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1fd464764765..f04428e4c8e5 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1216,7 +1216,6 @@ send:
 }
 
 static struct genl_family tipc_genl_compat_family = {
-	.id		= GENL_ID_GENERATE,
 	.name		= TIPC_GENL_NAME,
 	.version	= TIPC_GENL_VERSION,
 	.hdrsize	= TIPC_GENL_HDRLEN,
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 3f816e2971ee..8ac83a41585f 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -573,7 +573,6 @@ size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
 
 
 struct genl_family wimax_gnl_family = {
-	.id = GENL_ID_GENERATE,
 	.name = "WiMAX",
 	.version = WIMAX_GNL_VERSION,
 	.hdrsize = 0,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7d8cb3330c86..714beafe05e0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -39,7 +39,6 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
 
 /* the netlink family */
 static struct genl_family nl80211_fam = {
-	.id = GENL_ID_GENERATE,		/* don't bother with a hardcoded ID */
 	.name = NL80211_GENL_NAME,	/* have users key off the name instead */
 	.hdrsize = 0,			/* no private header */
 	.version = 1,			/* no particular meaning now */
-- 
cgit v1.2.3


From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:03 +0200
Subject: genetlink: statically initialize families

Instead of providing macros/inline functions to initialize
the families, make all users initialize them statically and
get rid of the macros.

This reduces the kernel code size by about 1.6k on x86-64
(with allyesconfig).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c                  |  1 +
 drivers/net/gtp.c                     | 21 +++++++----
 drivers/net/macsec.c                  | 21 +++++++----
 drivers/net/team/team.c               | 22 +++++++----
 drivers/net/wireless/mac80211_hwsim.c | 26 +++++++------
 drivers/scsi/pmcraid.c                |  1 +
 drivers/target/target_core_user.c     |  1 +
 drivers/thermal/thermal_core.c        |  1 +
 fs/dlm/netlink.c                      | 15 +++++---
 fs/quota/netlink.c                    |  1 +
 include/linux/drbd_genl.h             |  2 +-
 include/linux/genl_magic_func.h       | 28 ++++++++------
 include/net/genetlink.h               | 71 ++++++-----------------------------
 kernel/taskstats.c                    | 17 ++++++---
 net/batman-adv/netlink.c              | 25 +++++++-----
 net/core/devlink.c                    | 27 +++++++------
 net/core/drop_monitor.c               | 20 ++++++----
 net/hsr/hsr_netlink.c                 | 22 +++++++----
 net/ieee802154/netlink.c              | 23 +++++++-----
 net/ieee802154/nl802154.c             | 34 ++++++++---------
 net/ipv4/fou.c                        | 22 ++++++-----
 net/ipv4/tcp_metrics.c                | 22 ++++++-----
 net/ipv6/ila/ila_xlat.c               | 24 +++++++-----
 net/irda/irnetlink.c                  | 19 ++++++----
 net/l2tp/l2tp_netlink.c               | 25 +++++++-----
 net/netfilter/ipvs/ip_vs_ctl.c        | 22 ++++++-----
 net/netlabel/netlabel_calipso.c       | 20 ++++++----
 net/netlabel/netlabel_cipso_v4.c      | 21 ++++++-----
 net/netlabel/netlabel_mgmt.c          | 20 ++++++----
 net/netlabel/netlabel_unlabeled.c     | 20 ++++++----
 net/netlink/genetlink.c               | 35 +++++++++--------
 net/nfc/netlink.c                     | 24 +++++++-----
 net/openvswitch/datapath.c            |  4 ++
 net/tipc/netlink.c                    | 22 ++++++-----
 net/tipc/netlink_compat.c             | 20 +++++-----
 net/wimax/stack.c                     | 19 +++++-----
 net/wireless/nl80211.c                | 33 ++++++++--------
 37 files changed, 414 insertions(+), 337 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index 8dfca3d53131..1ab12ad7d5ba 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -83,6 +83,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = {
 };
 
 static struct genl_family acpi_event_genl_family = {
+	.module = THIS_MODULE,
 	.name = ACPI_GENL_FAMILY_NAME,
 	.version = ACPI_GENL_VERSION,
 	.maxattr = ACPI_GENL_ATTR_MAX,
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index f66737ba1299..0604fd78f826 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -1094,13 +1094,7 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
-static struct genl_family gtp_genl_family = {
-	.name		= "gtp",
-	.version	= 0,
-	.hdrsize	= 0,
-	.maxattr	= GTPA_MAX,
-	.netnsok	= true,
-};
+static struct genl_family gtp_genl_family;
 
 static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 			      u32 type, struct pdp_ctx *pctx)
@@ -1296,6 +1290,17 @@ static const struct genl_ops gtp_genl_ops[] = {
 	},
 };
 
+static struct genl_family gtp_genl_family = {
+	.name		= "gtp",
+	.version	= 0,
+	.hdrsize	= 0,
+	.maxattr	= GTPA_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= gtp_genl_ops,
+	.n_ops		= ARRAY_SIZE(gtp_genl_ops),
+};
+
 static int __net_init gtp_net_init(struct net *net)
 {
 	struct gtp_net *gn = net_generic(net, gtp_net_id);
@@ -1335,7 +1340,7 @@ static int __init gtp_init(void)
 	if (err < 0)
 		goto error_out;
 
-	err = genl_register_family_with_ops(&gtp_genl_family, gtp_genl_ops);
+	err = genl_register_family(&gtp_genl_family);
 	if (err < 0)
 		goto unreg_rtnl_link;
 
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index a5309b81a786..63ca7a3c77cf 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -1421,13 +1421,7 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa)
 	macsec_txsa_put(tx_sa);
 }
 
-static struct genl_family macsec_fam = {
-	.name		= MACSEC_GENL_NAME,
-	.hdrsize	= 0,
-	.version	= MACSEC_GENL_VERSION,
-	.maxattr	= MACSEC_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family macsec_fam;
 
 static struct net_device *get_dev_from_nl(struct net *net,
 					  struct nlattr **attrs)
@@ -2654,6 +2648,17 @@ static const struct genl_ops macsec_genl_ops[] = {
 	},
 };
 
+static struct genl_family macsec_fam = {
+	.name		= MACSEC_GENL_NAME,
+	.hdrsize	= 0,
+	.version	= MACSEC_GENL_VERSION,
+	.maxattr	= MACSEC_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= macsec_genl_ops,
+	.n_ops		= ARRAY_SIZE(macsec_genl_ops),
+};
+
 static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
 				     struct net_device *dev)
 {
@@ -3461,7 +3466,7 @@ static int __init macsec_init(void)
 	if (err)
 		goto notifier;
 
-	err = genl_register_family_with_ops(&macsec_fam, macsec_genl_ops);
+	err = genl_register_family(&macsec_fam);
 	if (err)
 		goto rtnl;
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 0b50205764ff..46bf7c1216c0 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2150,12 +2150,7 @@ static struct rtnl_link_ops team_link_ops __read_mostly = {
  * Generic netlink custom interface
  ***********************************/
 
-static struct genl_family team_nl_family = {
-	.name		= TEAM_GENL_NAME,
-	.version	= TEAM_GENL_VERSION,
-	.maxattr	= TEAM_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family team_nl_family;
 
 static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = {
 	[TEAM_ATTR_UNSPEC]			= { .type = NLA_UNSPEC, },
@@ -2745,6 +2740,18 @@ static const struct genl_multicast_group team_nl_mcgrps[] = {
 	{ .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, },
 };
 
+static struct genl_family team_nl_family = {
+	.name		= TEAM_GENL_NAME,
+	.version	= TEAM_GENL_VERSION,
+	.maxattr	= TEAM_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= team_nl_ops,
+	.n_ops		= ARRAY_SIZE(team_nl_ops),
+	.mcgrps		= team_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(team_nl_mcgrps),
+};
+
 static int team_nl_send_multicast(struct sk_buff *skb,
 				  struct team *team, u32 portid)
 {
@@ -2768,8 +2775,7 @@ static int team_nl_send_event_port_get(struct team *team,
 
 static int team_nl_init(void)
 {
-	return genl_register_family_with_ops_groups(&team_nl_family, team_nl_ops,
-						    team_nl_mcgrps);
+	return genl_register_family(&team_nl_family);
 }
 
 static void team_nl_fini(void)
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 54b6cd62676e..5d4637e586e8 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -587,14 +587,8 @@ struct hwsim_radiotap_ack_hdr {
 	__le16 rt_chbitmask;
 } __packed;
 
-/* MAC80211_HWSIM netlinf family */
-static struct genl_family hwsim_genl_family = {
-	.hdrsize = 0,
-	.name = "MAC80211_HWSIM",
-	.version = 1,
-	.maxattr = HWSIM_ATTR_MAX,
-	.netnsok = true,
-};
+/* MAC80211_HWSIM netlink family */
+static struct genl_family hwsim_genl_family;
 
 enum hwsim_multicast_groups {
 	HWSIM_MCGRP_CONFIG,
@@ -3234,6 +3228,18 @@ static const struct genl_ops hwsim_ops[] = {
 	},
 };
 
+static struct genl_family hwsim_genl_family = {
+	.name = "MAC80211_HWSIM",
+	.version = 1,
+	.maxattr = HWSIM_ATTR_MAX,
+	.netnsok = true,
+	.module = THIS_MODULE,
+	.ops = hwsim_ops,
+	.n_ops = ARRAY_SIZE(hwsim_ops),
+	.mcgrps = hwsim_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(hwsim_mcgrps),
+};
+
 static void destroy_radio(struct work_struct *work)
 {
 	struct mac80211_hwsim_data *data =
@@ -3287,9 +3293,7 @@ static int hwsim_init_netlink(void)
 
 	printk(KERN_INFO "mac80211_hwsim: initializing netlink\n");
 
-	rc = genl_register_family_with_ops_groups(&hwsim_genl_family,
-						  hwsim_ops,
-						  hwsim_mcgrps);
+	rc = genl_register_family(&hwsim_genl_family);
 	if (rc)
 		goto failure;
 
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index cc50eb87b28a..c0ab7bb8c3ce 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1369,6 +1369,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = {
 };
 
 static struct genl_family pmcraid_event_family = {
+	.module = THIS_MODULE,
 	.name = "pmcraid",
 	.version = 1,
 	.maxattr = PMCRAID_AEN_ATTR_MAX,
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 313a0ef3cda7..3483372f5562 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -148,6 +148,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = {
 
 /* Our generic netlink family */
 static struct genl_family tcmu_genl_family = {
+	.module = THIS_MODULE,
 	.hdrsize = 0,
 	.name = "TCM-USER",
 	.version = 1,
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 68d7503f6417..93b6caab2d9f 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -2164,6 +2164,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = {
 };
 
 static struct genl_family thermal_event_genl_family = {
+	.module = THIS_MODULE,
 	.name = THERMAL_GENL_FAMILY_NAME,
 	.version = THERMAL_GENL_VERSION,
 	.maxattr = THERMAL_GENL_ATTR_MAX,
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 00d226956264..04042d69573c 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -16,10 +16,7 @@
 static uint32_t dlm_nl_seqnum;
 static uint32_t listener_nlportid;
 
-static struct genl_family family = {
-	.name		= DLM_GENL_NAME,
-	.version	= DLM_GENL_VERSION,
-};
+static struct genl_family family;
 
 static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
 {
@@ -75,9 +72,17 @@ static struct genl_ops dlm_nl_ops[] = {
 	},
 };
 
+static struct genl_family family = {
+	.name		= DLM_GENL_NAME,
+	.version	= DLM_GENL_VERSION,
+	.ops		= dlm_nl_ops,
+	.n_ops		= ARRAY_SIZE(dlm_nl_ops),
+	.module		= THIS_MODULE,
+};
+
 int __init dlm_netlink_init(void)
 {
-	return genl_register_family_with_ops(&family, dlm_nl_ops);
+	return genl_register_family(&family);
 }
 
 void dlm_netlink_exit(void)
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 3965a5cdfaa2..9457c7b0dfa2 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -13,6 +13,7 @@ static const struct genl_multicast_group quota_mcgrps[] = {
 
 /* Netlink family structure for quota */
 static struct genl_family quota_genl_family = {
+	.module = THIS_MODULE,
 	.hdrsize = 0,
 	.name = "VFS_DQUOT",
 	.version = 1,
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index c934d3a96b5e..2896f93808ae 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -67,7 +67,7 @@
  *	genl_magic_func.h
  *		generates an entry in the static genl_ops array,
  *		and static register/unregister functions to
- *		genl_register_family_with_ops().
+ *		genl_register_family().
  *
  *	flags and handler:
  *		GENL_op_init( .doit = x, .dumpit = y, .flags = something)
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 7c070c1fe457..40c2e39362c8 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -259,15 +259,7 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
  *									{{{2
  */
 #define ZZZ_genl_family		CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
-static struct genl_family ZZZ_genl_family __read_mostly = {
-	.name = __stringify(GENL_MAGIC_FAMILY),
-	.version = GENL_MAGIC_VERSION,
-#ifdef GENL_MAGIC_FAMILY_HDRSZ
-	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
-#endif
-	.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
-};
-
+static struct genl_family ZZZ_genl_family;
 /*
  * Magic: define multicast groups
  * Magic: define multicast group registration helper
@@ -301,11 +293,23 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 #undef GENL_mc_group
 #define GENL_mc_group(group)
 
+static struct genl_family ZZZ_genl_family __read_mostly = {
+	.name = __stringify(GENL_MAGIC_FAMILY),
+	.version = GENL_MAGIC_VERSION,
+#ifdef GENL_MAGIC_FAMILY_HDRSZ
+	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
+#endif
+	.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
+	.ops = ZZZ_genl_ops,
+	.n_ops = ARRAY_SIZE(ZZZ_genl_ops),
+	.mcgrps = ZZZ_genl_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps),
+	.module = THIS_MODULE,
+};
+
 int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
 {
-	return genl_register_family_with_ops_groups(&ZZZ_genl_family,	\
-						    ZZZ_genl_ops,	\
-						    ZZZ_genl_mcgrps);
+	return genl_register_family(&ZZZ_genl_family);
 }
 
 void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 43a5c3975a2f..2298b50cee34 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -39,13 +39,14 @@ struct genl_info;
  *	Note that unbind() will not be called symmetrically if the
  *	generic netlink family is removed while there are still open
  *	sockets.
- * @attrbuf: buffer to store parsed attributes
- * @family_list: family list
- * @mcgrps: multicast groups used by this family (private)
- * @n_mcgrps: number of multicast groups (private)
+ * @attrbuf: buffer to store parsed attributes (private)
+ * @family_list: family list (private)
+ * @mcgrps: multicast groups used by this family
+ * @n_mcgrps: number of multicast groups
  * @mcgrp_offset: starting number of multicast group IDs in this family
- * @ops: the operations supported by this family (private)
- * @n_ops: number of operations supported by this family (private)
+ *	(private)
+ * @ops: the operations supported by this family
+ * @n_ops: number of operations supported by this family
  */
 struct genl_family {
 	unsigned int		id;		/* private */
@@ -64,10 +65,10 @@ struct genl_family {
 	int			(*mcast_bind)(struct net *net, int group);
 	void			(*mcast_unbind)(struct net *net, int group);
 	struct nlattr **	attrbuf;	/* private */
-	const struct genl_ops *	ops;		/* private */
-	const struct genl_multicast_group *mcgrps; /* private */
-	unsigned int		n_ops;		/* private */
-	unsigned int		n_mcgrps;	/* private */
+	const struct genl_ops *	ops;
+	const struct genl_multicast_group *mcgrps;
+	unsigned int		n_ops;
+	unsigned int		n_mcgrps;
 	unsigned int		mcgrp_offset;	/* private */
 	struct list_head	family_list;	/* private */
 	struct module		*module;
@@ -132,55 +133,7 @@ struct genl_ops {
 	u8			flags;
 };
 
-int __genl_register_family(struct genl_family *family);
-
-static inline int genl_register_family(struct genl_family *family)
-{
-	family->module = THIS_MODULE;
-	return __genl_register_family(family);
-}
-
-/**
- * genl_register_family_with_ops - register a generic netlink family with ops
- * @family: generic netlink family
- * @ops: operations to be registered
- * @n_ops: number of elements to register
- *
- * Registers the specified family and operations from the specified table.
- * Only one family may be registered with the same family name or identifier.
- *
- * Either a doit or dumpit callback must be specified for every registered
- * operation or the function will fail. Only one operation structure per
- * command identifier may be registered.
- *
- * See include/net/genetlink.h for more documenation on the operations
- * structure.
- *
- * Return 0 on success or a negative error code.
- */
-static inline int
-_genl_register_family_with_ops_grps(struct genl_family *family,
-				    const struct genl_ops *ops, size_t n_ops,
-				    const struct genl_multicast_group *mcgrps,
-				    size_t n_mcgrps)
-{
-	family->module = THIS_MODULE;
-	family->ops = ops;
-	family->n_ops = n_ops;
-	family->mcgrps = mcgrps;
-	family->n_mcgrps = n_mcgrps;
-	return __genl_register_family(family);
-}
-
-#define genl_register_family_with_ops(family, ops)			\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    NULL, 0)
-#define genl_register_family_with_ops_groups(family, ops, grps)	\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    (grps), ARRAY_SIZE(grps))
-
+int genl_register_family(struct genl_family *family);
 int genl_unregister_family(struct genl_family *family);
 void genl_notify(struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d7a1a9461a10..4075ece592f2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -41,11 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum);
 static int family_registered;
 struct kmem_cache *taskstats_cache;
 
-static struct genl_family family = {
-	.name		= TASKSTATS_GENL_NAME,
-	.version	= TASKSTATS_GENL_VERSION,
-	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
-};
+static struct genl_family family;
 
 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
@@ -650,6 +646,15 @@ static const struct genl_ops taskstats_ops[] = {
 	},
 };
 
+static struct genl_family family = {
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+	.module		= THIS_MODULE,
+	.ops		= taskstats_ops,
+	.n_ops		= ARRAY_SIZE(taskstats_ops),
+};
+
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
@@ -666,7 +671,7 @@ static int __init taskstats_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops(&family, taskstats_ops);
+	rc = genl_register_family(&family);
 	if (rc)
 		return rc;
 
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index a03b0ed7e8dd..e28cec34a016 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -48,13 +48,7 @@
 #include "tp_meter.h"
 #include "translation-table.h"
 
-struct genl_family batadv_netlink_family = {
-	.hdrsize = 0,
-	.name = BATADV_NL_NAME,
-	.version = 1,
-	.maxattr = BATADV_ATTR_MAX,
-	.netnsok = true,
-};
+struct genl_family batadv_netlink_family;
 
 /* multicast groups */
 enum batadv_netlink_multicast_groups {
@@ -609,6 +603,19 @@ static struct genl_ops batadv_netlink_ops[] = {
 
 };
 
+struct genl_family batadv_netlink_family = {
+	.hdrsize = 0,
+	.name = BATADV_NL_NAME,
+	.version = 1,
+	.maxattr = BATADV_ATTR_MAX,
+	.netnsok = true,
+	.module = THIS_MODULE,
+	.ops = batadv_netlink_ops,
+	.n_ops = ARRAY_SIZE(batadv_netlink_ops),
+	.mcgrps = batadv_netlink_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
+};
+
 /**
  * batadv_netlink_register - register batadv genl netlink family
  */
@@ -616,9 +623,7 @@ void __init batadv_netlink_register(void)
 {
 	int ret;
 
-	ret = genl_register_family_with_ops_groups(&batadv_netlink_family,
-						   batadv_netlink_ops,
-						   batadv_netlink_mcgrps);
+	ret = genl_register_family(&batadv_netlink_family);
 	if (ret)
 		pr_warn("unable to register netlink family");
 }
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 3008d9c33875..063da8091aef 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -341,14 +341,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops,
 	mutex_unlock(&devlink_mutex);
 }
 
-static struct genl_family devlink_nl_family = {
-	.name		= DEVLINK_GENL_NAME,
-	.version	= DEVLINK_GENL_VERSION,
-	.maxattr	= DEVLINK_ATTR_MAX,
-	.netnsok	= true,
-	.pre_doit	= devlink_nl_pre_doit,
-	.post_doit	= devlink_nl_post_doit,
-};
+static struct genl_family devlink_nl_family;
 
 enum devlink_multicast_groups {
 	DEVLINK_MCGRP_CONFIG,
@@ -1619,6 +1612,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 	},
 };
 
+static struct genl_family devlink_nl_family = {
+	.name		= DEVLINK_GENL_NAME,
+	.version	= DEVLINK_GENL_VERSION,
+	.maxattr	= DEVLINK_ATTR_MAX,
+	.netnsok	= true,
+	.pre_doit	= devlink_nl_pre_doit,
+	.post_doit	= devlink_nl_post_doit,
+	.module		= THIS_MODULE,
+	.ops		= devlink_nl_ops,
+	.n_ops		= ARRAY_SIZE(devlink_nl_ops),
+	.mcgrps		= devlink_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(devlink_nl_mcgrps),
+};
+
 /**
  *	devlink_alloc - Allocate new devlink instance resources
  *
@@ -1841,9 +1848,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 
 static int __init devlink_module_init(void)
 {
-	return genl_register_family_with_ops_groups(&devlink_nl_family,
-						    devlink_nl_ops,
-						    devlink_nl_mcgrps);
+	return genl_register_family(&devlink_nl_family);
 }
 
 static void __exit devlink_module_exit(void)
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index a5320dfcd978..80c002794ff6 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -59,11 +59,7 @@ struct dm_hw_stat_delta {
 	unsigned long last_drop_val;
 };
 
-static struct genl_family net_drop_monitor_family = {
-	.hdrsize        = 0,
-	.name           = "NET_DM",
-	.version        = 2,
-};
+static struct genl_family net_drop_monitor_family;
 
 static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
 
@@ -350,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = {
 	},
 };
 
+static struct genl_family net_drop_monitor_family = {
+	.hdrsize        = 0,
+	.name           = "NET_DM",
+	.version        = 2,
+	.module		= THIS_MODULE,
+	.ops		= dropmon_ops,
+	.n_ops		= ARRAY_SIZE(dropmon_ops),
+	.mcgrps		= dropmon_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(dropmon_mcgrps),
+};
+
 static struct notifier_block dropmon_net_notifier = {
 	.notifier_call = dropmon_net_event
 };
@@ -366,8 +373,7 @@ static int __init init_net_drop_monitor(void)
 		return -ENOSPC;
 	}
 
-	rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
-						  dropmon_ops, dropmon_mcgrps);
+	rc = genl_register_family(&net_drop_monitor_family);
 	if (rc) {
 		pr_err("Could not create drop monitor netlink family\n");
 		return rc;
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 2ad039492bee..aab34c7f6f89 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -131,12 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
 	[HSR_A_IF2_SEQ] = { .type = NLA_U16 },
 };
 
-static struct genl_family hsr_genl_family = {
-	.hdrsize = 0,
-	.name = "HSR",
-	.version = 1,
-	.maxattr = HSR_A_MAX,
-};
+static struct genl_family hsr_genl_family;
 
 static const struct genl_multicast_group hsr_mcgrps[] = {
 	{ .name = "hsr-network", },
@@ -466,6 +461,18 @@ static const struct genl_ops hsr_ops[] = {
 	},
 };
 
+static struct genl_family hsr_genl_family = {
+	.hdrsize = 0,
+	.name = "HSR",
+	.version = 1,
+	.maxattr = HSR_A_MAX,
+	.module = THIS_MODULE,
+	.ops = hsr_ops,
+	.n_ops = ARRAY_SIZE(hsr_ops),
+	.mcgrps = hsr_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
+};
+
 int __init hsr_netlink_init(void)
 {
 	int rc;
@@ -474,8 +481,7 @@ int __init hsr_netlink_init(void)
 	if (rc)
 		goto fail_rtnl_link_register;
 
-	rc = genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops,
-						  hsr_mcgrps);
+	rc = genl_register_family(&hsr_genl_family);
 	if (rc)
 		goto fail_genl_register_family;
 
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 19144158b696..08e62470bac2 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -28,13 +28,6 @@
 static unsigned int ieee802154_seq_num;
 static DEFINE_SPINLOCK(ieee802154_seq_lock);
 
-struct genl_family nl802154_family = {
-	.hdrsize	= 0,
-	.name		= IEEE802154_NL_NAME,
-	.version	= 1,
-	.maxattr	= IEEE802154_ATTR_MAX,
-};
-
 /* Requests to userspace */
 struct sk_buff *ieee802154_nl_create(int flags, u8 req)
 {
@@ -138,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = {
 	[IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
 };
 
+struct genl_family nl802154_family = {
+	.hdrsize	= 0,
+	.name		= IEEE802154_NL_NAME,
+	.version	= 1,
+	.maxattr	= IEEE802154_ATTR_MAX,
+	.module		= THIS_MODULE,
+	.ops		= ieee8021154_ops,
+	.n_ops		= ARRAY_SIZE(ieee8021154_ops),
+	.mcgrps		= ieee802154_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(ieee802154_mcgrps),
+};
+
 int __init ieee802154_nl_init(void)
 {
-	return genl_register_family_with_ops_groups(&nl802154_family,
-						    ieee8021154_ops,
-						    ieee802154_mcgrps);
+	return genl_register_family(&nl802154_family);
 }
 
 void ieee802154_nl_exit(void)
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 182299858f1d..f7e75578aedd 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -26,22 +26,8 @@
 #include "rdev-ops.h"
 #include "core.h"
 
-static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
-			     struct genl_info *info);
-
-static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
-			       struct genl_info *info);
-
 /* the netlink family */
-static struct genl_family nl802154_fam = {
-	.name = NL802154_GENL_NAME,	/* have users key off the name instead */
-	.hdrsize = 0,			/* no private header */
-	.version = 1,			/* no particular meaning now */
-	.maxattr = NL802154_ATTR_MAX,
-	.netnsok = true,
-	.pre_doit = nl802154_pre_doit,
-	.post_doit = nl802154_post_doit,
-};
+static struct genl_family nl802154_fam;
 
 /* multicast groups */
 enum nl802154_multicast_groups {
@@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = {
 #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
 };
 
+static struct genl_family nl802154_fam = {
+	.name = NL802154_GENL_NAME,	/* have users key off the name instead */
+	.hdrsize = 0,			/* no private header */
+	.version = 1,			/* no particular meaning now */
+	.maxattr = NL802154_ATTR_MAX,
+	.netnsok = true,
+	.pre_doit = nl802154_pre_doit,
+	.post_doit = nl802154_post_doit,
+	.module = THIS_MODULE,
+	.ops = nl802154_ops,
+	.n_ops = ARRAY_SIZE(nl802154_ops),
+	.mcgrps = nl802154_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
+};
+
 /* initialisation/exit functions */
 int nl802154_init(void)
 {
-	return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops,
-						    nl802154_mcgrps);
+	return genl_register_family(&nl802154_fam);
 }
 
 void nl802154_exit(void)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index e3fc527c5d37..5b5226a2434f 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -622,13 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
 	return err;
 }
 
-static struct genl_family fou_nl_family = {
-	.hdrsize	= 0,
-	.name		= FOU_GENL_NAME,
-	.version	= FOU_GENL_VERSION,
-	.maxattr	= FOU_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family fou_nl_family;
 
 static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
 	[FOU_ATTR_PORT] = { .type = NLA_U16, },
@@ -830,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = {
 	},
 };
 
+static struct genl_family fou_nl_family = {
+	.hdrsize	= 0,
+	.name		= FOU_GENL_NAME,
+	.version	= FOU_GENL_VERSION,
+	.maxattr	= FOU_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= fou_nl_ops,
+	.n_ops		= ARRAY_SIZE(fou_nl_ops),
+};
+
 size_t fou_encap_hlen(struct ip_tunnel_encap *e)
 {
 	return sizeof(struct udphdr);
@@ -1085,8 +1090,7 @@ static int __init fou_init(void)
 	if (ret)
 		goto exit;
 
-	ret = genl_register_family_with_ops(&fou_nl_family,
-					    fou_nl_ops);
+	ret = genl_register_family(&fou_nl_family);
 	if (ret < 0)
 		goto unregister;
 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 3da305127b32..bba3c72c4a39 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -742,13 +742,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
 	rcu_read_unlock();
 }
 
-static struct genl_family tcp_metrics_nl_family = {
-	.hdrsize	= 0,
-	.name		= TCP_METRICS_GENL_NAME,
-	.version	= TCP_METRICS_GENL_VERSION,
-	.maxattr	= TCP_METRICS_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family tcp_metrics_nl_family;
 
 static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
 	[TCP_METRICS_ATTR_ADDR_IPV4]	= { .type = NLA_U32, },
@@ -1115,6 +1109,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
 	},
 };
 
+static struct genl_family tcp_metrics_nl_family = {
+	.hdrsize	= 0,
+	.name		= TCP_METRICS_GENL_NAME,
+	.version	= TCP_METRICS_GENL_VERSION,
+	.maxattr	= TCP_METRICS_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= tcp_metrics_nl_ops,
+	.n_ops		= ARRAY_SIZE(tcp_metrics_nl_ops),
+};
+
 static unsigned int tcpmhash_entries;
 static int __init set_tcpmhash_entries(char *str)
 {
@@ -1178,8 +1183,7 @@ void __init tcp_metrics_init(void)
 	if (ret < 0)
 		panic("Could not allocate the tcp_metrics hash table\n");
 
-	ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
-					    tcp_metrics_nl_ops);
+	ret = genl_register_family(&tcp_metrics_nl_family);
 	if (ret < 0)
 		panic("Could not register tcp_metrics generic netlink\n");
 }
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 0d57e27d1cdd..97f7b0cc4675 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -118,14 +118,7 @@ static const struct rhashtable_params rht_params = {
 	.obj_cmpfn = ila_cmpfn,
 };
 
-static struct genl_family ila_nl_family = {
-	.hdrsize	= 0,
-	.name		= ILA_GENL_NAME,
-	.version	= ILA_GENL_VERSION,
-	.maxattr	= ILA_ATTR_MAX,
-	.netnsok	= true,
-	.parallel_ops	= true,
-};
+static struct genl_family ila_nl_family;
 
 static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
@@ -560,6 +553,18 @@ static const struct genl_ops ila_nl_ops[] = {
 	},
 };
 
+static struct genl_family ila_nl_family = {
+	.hdrsize	= 0,
+	.name		= ILA_GENL_NAME,
+	.version	= ILA_GENL_VERSION,
+	.maxattr	= ILA_ATTR_MAX,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.ops		= ila_nl_ops,
+	.n_ops		= ARRAY_SIZE(ila_nl_ops),
+};
+
 #define ILA_HASH_TABLE_SIZE 1024
 
 static __net_init int ila_init_net(struct net *net)
@@ -630,8 +635,7 @@ int ila_xlat_init(void)
 	if (ret)
 		goto exit;
 
-	ret = genl_register_family_with_ops(&ila_nl_family,
-					    ila_nl_ops);
+	ret = genl_register_family(&ila_nl_family);
 	if (ret < 0)
 		goto unregister;
 
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index f23b81aa91fe..07877347c2f7 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -24,12 +24,7 @@
 
 
-static struct genl_family irda_nl_family = {
-	.name = IRDA_NL_NAME,
-	.hdrsize = 0,
-	.version = IRDA_NL_VERSION,
-	.maxattr = IRDA_NL_CMD_MAX,
-};
+static struct genl_family irda_nl_family;
 
 static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info)
 {
@@ -146,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = {
 
 };
 
+static struct genl_family irda_nl_family = {
+	.name = IRDA_NL_NAME,
+	.hdrsize = 0,
+	.version = IRDA_NL_VERSION,
+	.maxattr = IRDA_NL_CMD_MAX,
+	.module = THIS_MODULE,
+	.ops = irda_nl_ops,
+	.n_ops = ARRAY_SIZE(irda_nl_ops),
+};
+
 int irda_nl_register(void)
 {
-	return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops);
+	return genl_register_family(&irda_nl_family);
 }
 
 void irda_nl_unregister(void)
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 4fbf1f41ac52..e4e8c0769a6b 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -31,13 +31,7 @@
 #include "l2tp_core.h"
 
 
-static struct genl_family l2tp_nl_family = {
-	.name		= L2TP_GENL_NAME,
-	.version	= L2TP_GENL_VERSION,
-	.hdrsize	= 0,
-	.maxattr	= L2TP_ATTR_MAX,
-	.netnsok	= true,
-};
+static struct genl_family l2tp_nl_family;
 
 static const struct genl_multicast_group l2tp_multicast_group[] = {
 	{
@@ -976,6 +970,19 @@ static const struct genl_ops l2tp_nl_ops[] = {
 	},
 };
 
+static struct genl_family l2tp_nl_family = {
+	.name		= L2TP_GENL_NAME,
+	.version	= L2TP_GENL_VERSION,
+	.hdrsize	= 0,
+	.maxattr	= L2TP_ATTR_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= l2tp_nl_ops,
+	.n_ops		= ARRAY_SIZE(l2tp_nl_ops),
+	.mcgrps		= l2tp_multicast_group,
+	.n_mcgrps	= ARRAY_SIZE(l2tp_multicast_group),
+};
+
 int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops)
 {
 	int ret;
@@ -1012,9 +1019,7 @@ EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
 static int l2tp_nl_init(void)
 {
 	pr_info("L2TP netlink interface\n");
-	return genl_register_family_with_ops_groups(&l2tp_nl_family,
-						    l2tp_nl_ops,
-						    l2tp_multicast_group);
+	return genl_register_family(&l2tp_nl_family);
 }
 
 static void l2tp_nl_cleanup(void)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ceed66cdd03e..ea3e8aed063f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2840,13 +2840,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = {
  */
 
 /* IPVS genetlink family */
-static struct genl_family ip_vs_genl_family = {
-	.hdrsize	= 0,
-	.name		= IPVS_GENL_NAME,
-	.version	= IPVS_GENL_VERSION,
-	.maxattr	= IPVS_CMD_MAX,
-	.netnsok        = true,         /* Make ipvsadm to work on netns */
-};
+static struct genl_family ip_vs_genl_family;
 
 /* Policy used for first-level command attributes */
 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
@@ -3871,10 +3865,20 @@ static const struct genl_ops ip_vs_genl_ops[] = {
 	},
 };
 
+static struct genl_family ip_vs_genl_family = {
+	.hdrsize	= 0,
+	.name		= IPVS_GENL_NAME,
+	.version	= IPVS_GENL_VERSION,
+	.maxattr	= IPVS_CMD_MAX,
+	.netnsok        = true,         /* Make ipvsadm to work on netns */
+	.module		= THIS_MODULE,
+	.ops		= ip_vs_genl_ops,
+	.n_ops		= ARRAY_SIZE(ip_vs_genl_ops),
+};
+
 static int __init ip_vs_genl_register(void)
 {
-	return genl_register_family_with_ops(&ip_vs_genl_family,
-					     ip_vs_genl_ops);
+	return genl_register_family(&ip_vs_genl_family);
 }
 
 static void ip_vs_genl_unregister(void)
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index 152e503b8c5d..ca7c9c411a5c 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg {
 };
 
 /* NetLabel Generic NETLINK CALIPSO family */
-static struct genl_family netlbl_calipso_gnl_family = {
-	.hdrsize = 0,
-	.name = NETLBL_NLTYPE_CALIPSO_NAME,
-	.version = NETLBL_PROTO_VERSION,
-	.maxattr = NLBL_CALIPSO_A_MAX,
-};
+static struct genl_family netlbl_calipso_gnl_family;
 
 /* NetLabel Netlink attribute policy */
 static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
@@ -354,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = {
 	},
 };
 
+static struct genl_family netlbl_calipso_gnl_family = {
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_CALIPSO_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_CALIPSO_A_MAX,
+	.module = THIS_MODULE,
+	.ops = netlbl_calipso_ops,
+	.n_ops = ARRAY_SIZE(netlbl_calipso_ops),
+};
+
 /* NetLabel Generic NETLINK Protocol Functions
  */
 
@@ -367,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = {
  */
 int __init netlbl_calipso_genl_init(void)
 {
-	return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
-					     netlbl_calipso_ops);
+	return genl_register_family(&netlbl_calipso_gnl_family);
 }
 
 static const struct netlbl_calipso_ops *calipso_ops;
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 755b284e7ad4..a665eae91245 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -59,13 +59,7 @@ struct netlbl_domhsh_walk_arg {
 };
 
 /* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_cipsov4_gnl_family = {
-	.hdrsize = 0,
-	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
-	.version = NETLBL_PROTO_VERSION,
-	.maxattr = NLBL_CIPSOV4_A_MAX,
-};
-
+static struct genl_family netlbl_cipsov4_gnl_family;
 /* NetLabel Netlink attribute policy */
 static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = {
 	[NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 },
@@ -766,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
 	},
 };
 
+static struct genl_family netlbl_cipsov4_gnl_family = {
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_CIPSOV4_A_MAX,
+	.module = THIS_MODULE,
+	.ops = netlbl_cipsov4_ops,
+	.n_ops = ARRAY_SIZE(netlbl_cipsov4_ops),
+};
+
 /*
  * NetLabel Generic NETLINK Protocol Functions
  */
@@ -780,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
  */
 int __init netlbl_cipsov4_genl_init(void)
 {
-	return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family,
-					     netlbl_cipsov4_ops);
+	return genl_register_family(&netlbl_cipsov4_gnl_family);
 }
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 3b00f2368fcd..ecfe8eb149db 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg {
 };
 
 /* NetLabel Generic NETLINK CIPSOv4 family */
-static struct genl_family netlbl_mgmt_gnl_family = {
-	.hdrsize = 0,
-	.name = NETLBL_NLTYPE_MGMT_NAME,
-	.version = NETLBL_PROTO_VERSION,
-	.maxattr = NLBL_MGMT_A_MAX,
-};
+static struct genl_family netlbl_mgmt_gnl_family;
 
 /* NetLabel Netlink attribute policy */
 static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
@@ -833,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
 	},
 };
 
+static struct genl_family netlbl_mgmt_gnl_family = {
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_MGMT_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_MGMT_A_MAX,
+	.module = THIS_MODULE,
+	.ops = netlbl_mgmt_genl_ops,
+	.n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops),
+};
+
 /*
  * NetLabel Generic NETLINK Protocol Functions
  */
@@ -847,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
  */
 int __init netlbl_mgmt_genl_init(void)
 {
-	return genl_register_family_with_ops(&netlbl_mgmt_gnl_family,
-					     netlbl_mgmt_genl_ops);
+	return genl_register_family(&netlbl_mgmt_gnl_family);
 }
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index c2ea8d1f653a..5dbbad41114f 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -123,12 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
 static u8 netlabel_unlabel_acceptflg;
 
 /* NetLabel Generic NETLINK unlabeled family */
-static struct genl_family netlbl_unlabel_gnl_family = {
-	.hdrsize = 0,
-	.name = NETLBL_NLTYPE_UNLABELED_NAME,
-	.version = NETLBL_PROTO_VERSION,
-	.maxattr = NLBL_UNLABEL_A_MAX,
-};
+static struct genl_family netlbl_unlabel_gnl_family;
 
 /* NetLabel Netlink attribute policy */
 static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
@@ -1377,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
 	},
 };
 
+static struct genl_family netlbl_unlabel_gnl_family = {
+	.hdrsize = 0,
+	.name = NETLBL_NLTYPE_UNLABELED_NAME,
+	.version = NETLBL_PROTO_VERSION,
+	.maxattr = NLBL_UNLABEL_A_MAX,
+	.module = THIS_MODULE,
+	.ops = netlbl_unlabel_genl_ops,
+	.n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
+};
+
 /*
  * NetLabel Generic NETLINK Protocol Functions
  */
@@ -1391,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
  */
 int __init netlbl_unlabel_genl_init(void)
 {
-	return genl_register_family_with_ops(&netlbl_unlabel_gnl_family,
-					     netlbl_unlabel_genl_ops);
+	return genl_register_family(&netlbl_unlabel_gnl_family);
 }
 
 /*
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f19ec969edee..ca582ee4ae05 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -344,18 +344,18 @@ static int genl_validate_ops(const struct genl_family *family)
 }
 
 /**
- * __genl_register_family - register a generic netlink family
+ * genl_register_family - register a generic netlink family
  * @family: generic netlink family
  *
  * Registers the specified family after validating it first. Only one
  * family may be registered with the same family name or identifier.
  *
- * The family's ops array must already be assigned, you can use the
- * genl_register_family_with_ops() helper function.
+ * The family's ops, multicast groups and module pointer must already
+ * be assigned.
  *
  * Return 0 on success or a negative error code.
  */
-int __genl_register_family(struct genl_family *family)
+int genl_register_family(struct genl_family *family)
 {
 	int err, i;
 
@@ -429,7 +429,7 @@ errout_locked:
 	genl_unlock_all();
 	return err;
 }
-EXPORT_SYMBOL(__genl_register_family);
+EXPORT_SYMBOL(genl_register_family);
 
 /**
  * genl_unregister_family - unregister generic netlink family
@@ -452,7 +452,6 @@ int genl_unregister_family(struct genl_family *family)
 		genl_unregister_mc_groups(family);
 
 		list_del(&rc->family_list);
-		family->n_ops = 0;
 		up_write(&cb_lock);
 		wait_event(genl_sk_destructing_waitq,
 			   atomic_read(&genl_sk_destructing_cnt) == 0);
@@ -681,13 +680,7 @@ static void genl_rcv(struct sk_buff *skb)
  * Controller
  **************************************************************************/
 
-static struct genl_family genl_ctrl = {
-	.id = GENL_ID_CTRL,
-	.name = "nlctrl",
-	.version = 0x2,
-	.maxattr = CTRL_ATTR_MAX,
-	.netnsok = true,
-};
+static struct genl_family genl_ctrl;
 
 static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq,
 			  u32 flags, struct sk_buff *skb, u8 cmd)
@@ -997,6 +990,19 @@ static const struct genl_multicast_group genl_ctrl_groups[] = {
 	{ .name = "notify", },
 };
 
+static struct genl_family genl_ctrl = {
+	.module = THIS_MODULE,
+	.ops = genl_ctrl_ops,
+	.n_ops = ARRAY_SIZE(genl_ctrl_ops),
+	.mcgrps = genl_ctrl_groups,
+	.n_mcgrps = ARRAY_SIZE(genl_ctrl_groups),
+	.id = GENL_ID_CTRL,
+	.name = "nlctrl",
+	.version = 0x2,
+	.maxattr = CTRL_ATTR_MAX,
+	.netnsok = true,
+};
+
 static int genl_bind(struct net *net, int group)
 {
 	int i, err = -ENOENT;
@@ -1086,8 +1092,7 @@ static int __init genl_init(void)
 	for (i = 0; i < GENL_FAM_TAB_SIZE; i++)
 		INIT_LIST_HEAD(&family_ht[i]);
 
-	err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops,
-						   genl_ctrl_groups);
+	err = genl_register_family(&genl_ctrl);
 	if (err < 0)
 		goto problem;
 
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index c230403e066c..450b1e5144cc 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -38,13 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = {
 	{ .name = NFC_GENL_MCAST_EVENT_NAME, },
 };
 
-static struct genl_family nfc_genl_family = {
-	.hdrsize = 0,
-	.name = NFC_GENL_NAME,
-	.version = NFC_GENL_VERSION,
-	.maxattr = NFC_ATTR_MAX,
-};
-
+static struct genl_family nfc_genl_family;
 static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
 	[NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 },
 	[NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
@@ -1752,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = {
 	},
 };
 
+static struct genl_family nfc_genl_family = {
+	.hdrsize = 0,
+	.name = NFC_GENL_NAME,
+	.version = NFC_GENL_VERSION,
+	.maxattr = NFC_ATTR_MAX,
+	.module = THIS_MODULE,
+	.ops = nfc_genl_ops,
+	.n_ops = ARRAY_SIZE(nfc_genl_ops),
+	.mcgrps = nfc_genl_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps),
+};
+
 
 struct urelease_work {
 	struct	work_struct w;
@@ -1837,9 +1843,7 @@ int __init nfc_genl_init(void)
 {
 	int rc;
 
-	rc = genl_register_family_with_ops_groups(&nfc_genl_family,
-						  nfc_genl_ops,
-						  nfc_genl_mcgrps);
+	rc = genl_register_family(&nfc_genl_family);
 	if (rc)
 		return rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f9fef7dfba15..ad6a111a0014 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -679,6 +679,7 @@ static struct genl_family dp_packet_genl_family = {
 	.parallel_ops = true,
 	.ops = dp_packet_genl_ops,
 	.n_ops = ARRAY_SIZE(dp_packet_genl_ops),
+	.module = THIS_MODULE,
 };
 
 static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats,
@@ -1445,6 +1446,7 @@ static struct genl_family dp_flow_genl_family = {
 	.n_ops = ARRAY_SIZE(dp_flow_genl_ops),
 	.mcgrps = &ovs_dp_flow_multicast_group,
 	.n_mcgrps = 1,
+	.module = THIS_MODULE,
 };
 
 static size_t ovs_dp_cmd_msg_size(void)
@@ -1830,6 +1832,7 @@ static struct genl_family dp_datapath_genl_family = {
 	.n_ops = ARRAY_SIZE(dp_datapath_genl_ops),
 	.mcgrps = &ovs_dp_datapath_multicast_group,
 	.n_mcgrps = 1,
+	.module = THIS_MODULE,
 };
 
 /* Called with ovs_mutex or RCU read lock. */
@@ -2251,6 +2254,7 @@ struct genl_family dp_vport_genl_family = {
 	.n_ops = ARRAY_SIZE(dp_vport_genl_ops),
 	.mcgrps = &ovs_dp_vport_multicast_group,
 	.n_mcgrps = 1,
+	.module = THIS_MODULE,
 };
 
 static struct genl_family * const dp_genl_families[] = {
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 383b8fedabc7..74a405bf107b 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -135,14 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
 /* Users of the legacy API (tipc-config) can't handle that we add operations,
  * so we have a separate genl handling for the new API.
  */
-struct genl_family tipc_genl_family = {
-	.name		= TIPC_GENL_V2_NAME,
-	.version	= TIPC_GENL_V2_VERSION,
-	.hdrsize	= 0,
-	.maxattr	= TIPC_NLA_MAX,
-	.netnsok	= true,
-};
-
 static const struct genl_ops tipc_genl_v2_ops[] = {
 	{
 		.cmd	= TIPC_NL_BEARER_DISABLE,
@@ -257,6 +249,17 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
 #endif
 };
 
+struct genl_family tipc_genl_family = {
+	.name		= TIPC_GENL_V2_NAME,
+	.version	= TIPC_GENL_V2_VERSION,
+	.hdrsize	= 0,
+	.maxattr	= TIPC_NLA_MAX,
+	.netnsok	= true,
+	.module		= THIS_MODULE,
+	.ops		= tipc_genl_v2_ops,
+	.n_ops		= ARRAY_SIZE(tipc_genl_v2_ops),
+};
+
 int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
 {
 	u32 maxattr = tipc_genl_family.maxattr;
@@ -272,8 +275,7 @@ int tipc_netlink_start(void)
 {
 	int res;
 
-	res = genl_register_family_with_ops(&tipc_genl_family,
-					    tipc_genl_v2_ops);
+	res = genl_register_family(&tipc_genl_family);
 	if (res) {
 		pr_err("Failed to register netlink interface\n");
 		return res;
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index f04428e4c8e5..07b19931e458 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1215,27 +1215,29 @@ send:
 	return err;
 }
 
+static struct genl_ops tipc_genl_compat_ops[] = {
+	{
+		.cmd		= TIPC_GENL_CMD,
+		.doit		= tipc_nl_compat_recv,
+	},
+};
+
 static struct genl_family tipc_genl_compat_family = {
 	.name		= TIPC_GENL_NAME,
 	.version	= TIPC_GENL_VERSION,
 	.hdrsize	= TIPC_GENL_HDRLEN,
 	.maxattr	= 0,
 	.netnsok	= true,
-};
-
-static struct genl_ops tipc_genl_compat_ops[] = {
-	{
-		.cmd		= TIPC_GENL_CMD,
-		.doit		= tipc_nl_compat_recv,
-	},
+	.module		= THIS_MODULE,
+	.ops		= tipc_genl_compat_ops,
+	.n_ops		= ARRAY_SIZE(tipc_genl_compat_ops),
 };
 
 int tipc_netlink_compat_start(void)
 {
 	int res;
 
-	res = genl_register_family_with_ops(&tipc_genl_compat_family,
-					    tipc_genl_compat_ops);
+	res = genl_register_family(&tipc_genl_compat_family);
 	if (res) {
 		pr_err("Failed to register legacy compat interface\n");
 		return res;
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 8ac83a41585f..587e1627681f 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -572,15 +572,20 @@ struct d_level D_LEVEL[] = {
 size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL);
 
 
+static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
+	{ .name = "msg", },
+};
+
 struct genl_family wimax_gnl_family = {
 	.name = "WiMAX",
 	.version = WIMAX_GNL_VERSION,
 	.hdrsize = 0,
 	.maxattr = WIMAX_GNL_ATTR_MAX,
-};
-
-static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
-	{ .name = "msg", },
+	.module = THIS_MODULE,
+	.ops = wimax_gnl_ops,
+	.n_ops = ARRAY_SIZE(wimax_gnl_ops),
+	.mcgrps = wimax_gnl_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps),
 };
 
 
@@ -595,11 +600,7 @@ int __init wimax_subsys_init(void)
 	d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params,
 		       "wimax.debug");
 
-	snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name),
-		 "WiMAX");
-	result = genl_register_family_with_ops_groups(&wimax_gnl_family,
-						      wimax_gnl_ops,
-						      wimax_gnl_mcgrps);
+	result = genl_register_family(&wimax_gnl_family);
 	if (unlikely(result < 0)) {
 		pr_err("cannot register generic netlink family: %d\n", result);
 		goto error_register_family;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 714beafe05e0..8e5ca3c47593 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -32,21 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 				   struct cfg80211_crypto_settings *settings,
 				   int cipher_limit);
 
-static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
-			    struct genl_info *info);
-static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
-			      struct genl_info *info);
-
 /* the netlink family */
-static struct genl_family nl80211_fam = {
-	.name = NL80211_GENL_NAME,	/* have users key off the name instead */
-	.hdrsize = 0,			/* no private header */
-	.version = 1,			/* no particular meaning now */
-	.maxattr = NL80211_ATTR_MAX,
-	.netnsok = true,
-	.pre_doit = nl80211_pre_doit,
-	.post_doit = nl80211_post_doit,
-};
+static struct genl_family nl80211_fam;
 
 /* multicast groups */
 enum nl80211_multicast_groups {
@@ -12599,6 +12586,21 @@ static const struct genl_ops nl80211_ops[] = {
 	},
 };
 
+static struct genl_family nl80211_fam = {
+	.name = NL80211_GENL_NAME,	/* have users key off the name instead */
+	.hdrsize = 0,			/* no private header */
+	.version = 1,			/* no particular meaning now */
+	.maxattr = NL80211_ATTR_MAX,
+	.netnsok = true,
+	.pre_doit = nl80211_pre_doit,
+	.post_doit = nl80211_post_doit,
+	.module = THIS_MODULE,
+	.ops = nl80211_ops,
+	.n_ops = ARRAY_SIZE(nl80211_ops),
+	.mcgrps = nl80211_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(nl80211_mcgrps),
+};
+
 /* notification functions */
 
 void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev,
@@ -14565,8 +14567,7 @@ int nl80211_init(void)
 {
 	int err;
 
-	err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops,
-						   nl80211_mcgrps);
+	err = genl_register_family(&nl80211_fam);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 56989f6d8568c21257dcec0f5e644d5570ba3281 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:05 +0200
Subject: genetlink: mark families as __ro_after_init

Now genl_register_family() is the only thing (other than the
users themselves, perhaps, but I didn't find any doing that)
writing to the family struct.

In all families that I found, genl_register_family() is only
called from __init functions (some indirectly, in which case
I've added __init annotations to clarify things), so all can
actually be marked __ro_after_init.

This protects the data structure from accidental corruption.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/acpi/event.c                  |  4 ++--
 drivers/net/gtp.c                     |  2 +-
 drivers/net/macsec.c                  |  2 +-
 drivers/net/team/team.c               |  4 ++--
 drivers/net/wireless/mac80211_hwsim.c |  4 ++--
 drivers/scsi/pmcraid.c                |  4 ++--
 drivers/target/target_core_user.c     |  2 +-
 drivers/thermal/thermal_core.c        |  4 ++--
 fs/dlm/netlink.c                      |  2 +-
 fs/quota/netlink.c                    |  2 +-
 include/linux/genl_magic_func.h       |  2 +-
 kernel/taskstats.c                    |  2 +-
 net/batman-adv/netlink.c              |  2 +-
 net/core/devlink.c                    |  2 +-
 net/core/drop_monitor.c               |  2 +-
 net/hsr/hsr_netlink.c                 |  2 +-
 net/ieee802154/netlink.c              |  2 +-
 net/ieee802154/nl802154.c             |  4 ++--
 net/ipv4/fou.c                        |  2 +-
 net/ipv4/tcp_metrics.c                |  2 +-
 net/ipv6/ila/ila_xlat.c               |  4 ++--
 net/irda/irnetlink.c                  |  4 ++--
 net/l2tp/l2tp_netlink.c               |  4 ++--
 net/netfilter/ipvs/ip_vs_ctl.c        |  2 +-
 net/netlabel/netlabel_calipso.c       |  2 +-
 net/netlabel/netlabel_cipso_v4.c      |  2 +-
 net/netlabel/netlabel_mgmt.c          |  2 +-
 net/netlabel/netlabel_unlabeled.c     |  2 +-
 net/netlink/genetlink.c               |  2 +-
 net/nfc/netlink.c                     |  2 +-
 net/openvswitch/datapath.c            | 10 +++++-----
 net/tipc/netlink.c                    |  4 ++--
 net/tipc/netlink_compat.c             |  4 ++--
 net/wimax/stack.c                     |  2 +-
 net/wireless/nl80211.c                |  4 ++--
 35 files changed, 51 insertions(+), 51 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c
index 1ab12ad7d5ba..7fceb3b4691b 100644
--- a/drivers/acpi/event.c
+++ b/drivers/acpi/event.c
@@ -82,7 +82,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = {
 	{ .name = ACPI_GENL_MCAST_GROUP_NAME, },
 };
 
-static struct genl_family acpi_event_genl_family = {
+static struct genl_family acpi_event_genl_family __ro_after_init = {
 	.module = THIS_MODULE,
 	.name = ACPI_GENL_FAMILY_NAME,
 	.version = ACPI_GENL_VERSION,
@@ -144,7 +144,7 @@ int acpi_bus_generate_netlink_event(const char *device_class,
 
 EXPORT_SYMBOL(acpi_bus_generate_netlink_event);
 
-static int acpi_event_genetlink_init(void)
+static int __init acpi_event_genetlink_init(void)
 {
 	return genl_register_family(&acpi_event_genl_family);
 }
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 0604fd78f826..719d19f35673 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -1290,7 +1290,7 @@ static const struct genl_ops gtp_genl_ops[] = {
 	},
 };
 
-static struct genl_family gtp_genl_family = {
+static struct genl_family gtp_genl_family __ro_after_init = {
 	.name		= "gtp",
 	.version	= 0,
 	.hdrsize	= 0,
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 63ca7a3c77cf..0a715ab9d9cc 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2648,7 +2648,7 @@ static const struct genl_ops macsec_genl_ops[] = {
 	},
 };
 
-static struct genl_family macsec_fam = {
+static struct genl_family macsec_fam __ro_after_init = {
 	.name		= MACSEC_GENL_NAME,
 	.hdrsize	= 0,
 	.version	= MACSEC_GENL_VERSION,
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 46bf7c1216c0..bdc58567d10e 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2740,7 +2740,7 @@ static const struct genl_multicast_group team_nl_mcgrps[] = {
 	{ .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, },
 };
 
-static struct genl_family team_nl_family = {
+static struct genl_family team_nl_family __ro_after_init = {
 	.name		= TEAM_GENL_NAME,
 	.version	= TEAM_GENL_VERSION,
 	.maxattr	= TEAM_ATTR_MAX,
@@ -2773,7 +2773,7 @@ static int team_nl_send_event_port_get(struct team *team,
 					  port);
 }
 
-static int team_nl_init(void)
+static int __init team_nl_init(void)
 {
 	return genl_register_family(&team_nl_family);
 }
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 5d4637e586e8..220e9dc8ccf8 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -3228,7 +3228,7 @@ static const struct genl_ops hwsim_ops[] = {
 	},
 };
 
-static struct genl_family hwsim_genl_family = {
+static struct genl_family hwsim_genl_family __ro_after_init = {
 	.name = "MAC80211_HWSIM",
 	.version = 1,
 	.maxattr = HWSIM_ATTR_MAX,
@@ -3287,7 +3287,7 @@ static struct notifier_block hwsim_netlink_notifier = {
 	.notifier_call = mac80211_hwsim_netlink_notify,
 };
 
-static int hwsim_init_netlink(void)
+static int __init hwsim_init_netlink(void)
 {
 	int rc;
 
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index c0ab7bb8c3ce..845affa112f7 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1368,7 +1368,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = {
 	{ .name = "events", /* not really used - see ID discussion below */ },
 };
 
-static struct genl_family pmcraid_event_family = {
+static struct genl_family pmcraid_event_family __ro_after_init = {
 	.module = THIS_MODULE,
 	.name = "pmcraid",
 	.version = 1,
@@ -1384,7 +1384,7 @@ static struct genl_family pmcraid_event_family = {
  *	0 if the pmcraid_event_family is successfully registered
  *	with netlink generic, non-zero otherwise
  */
-static int pmcraid_netlink_init(void)
+static int __init pmcraid_netlink_init(void)
 {
 	int result;
 
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 3483372f5562..0f173bf7dbac 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -147,7 +147,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = {
 };
 
 /* Our generic netlink family */
-static struct genl_family tcmu_genl_family = {
+static struct genl_family tcmu_genl_family __ro_after_init = {
 	.module = THIS_MODULE,
 	.hdrsize = 0,
 	.name = "TCM-USER",
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 93b6caab2d9f..911fd964c742 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -2163,7 +2163,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = {
 	{ .name = THERMAL_GENL_MCAST_GROUP_NAME, },
 };
 
-static struct genl_family thermal_event_genl_family = {
+static struct genl_family thermal_event_genl_family __ro_after_init = {
 	.module = THIS_MODULE,
 	.name = THERMAL_GENL_FAMILY_NAME,
 	.version = THERMAL_GENL_VERSION,
@@ -2235,7 +2235,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL_GPL(thermal_generate_netlink_event);
 
-static int genetlink_init(void)
+static int __init genetlink_init(void)
 {
 	return genl_register_family(&thermal_event_genl_family);
 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 04042d69573c..0643ae44f342 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -72,7 +72,7 @@ static struct genl_ops dlm_nl_ops[] = {
 	},
 };
 
-static struct genl_family family = {
+static struct genl_family family __ro_after_init = {
 	.name		= DLM_GENL_NAME,
 	.version	= DLM_GENL_VERSION,
 	.ops		= dlm_nl_ops,
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 9457c7b0dfa2..e99b1a72d9a7 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -12,7 +12,7 @@ static const struct genl_multicast_group quota_mcgrps[] = {
 };
 
 /* Netlink family structure for quota */
-static struct genl_family quota_genl_family = {
+static struct genl_family quota_genl_family __ro_after_init = {
 	.module = THIS_MODULE,
 	.hdrsize = 0,
 	.name = "VFS_DQUOT",
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 40c2e39362c8..377257d8f7e3 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -293,7 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 #undef GENL_mc_group
 #define GENL_mc_group(group)
 
-static struct genl_family ZZZ_genl_family __read_mostly = {
+static struct genl_family ZZZ_genl_family __ro_after_init = {
 	.name = __stringify(GENL_MAGIC_FAMILY),
 	.version = GENL_MAGIC_VERSION,
 #ifdef GENL_MAGIC_FAMILY_HDRSZ
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4075ece592f2..9b7f838511ce 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -646,7 +646,7 @@ static const struct genl_ops taskstats_ops[] = {
 	},
 };
 
-static struct genl_family family = {
+static struct genl_family family __ro_after_init = {
 	.name		= TASKSTATS_GENL_NAME,
 	.version	= TASKSTATS_GENL_VERSION,
 	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index e28cec34a016..005012ba9b48 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -603,7 +603,7 @@ static struct genl_ops batadv_netlink_ops[] = {
 
 };
 
-struct genl_family batadv_netlink_family = {
+struct genl_family batadv_netlink_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = BATADV_NL_NAME,
 	.version = 1,
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 063da8091aef..c14f8b661db9 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1612,7 +1612,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 	},
 };
 
-static struct genl_family devlink_nl_family = {
+static struct genl_family devlink_nl_family __ro_after_init = {
 	.name		= DEVLINK_GENL_NAME,
 	.version	= DEVLINK_GENL_VERSION,
 	.maxattr	= DEVLINK_ATTR_MAX,
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 80c002794ff6..8e0c0635ee97 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -346,7 +346,7 @@ static const struct genl_ops dropmon_ops[] = {
 	},
 };
 
-static struct genl_family net_drop_monitor_family = {
+static struct genl_family net_drop_monitor_family __ro_after_init = {
 	.hdrsize        = 0,
 	.name           = "NET_DM",
 	.version        = 2,
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index aab34c7f6f89..1ab30e7d3f99 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -461,7 +461,7 @@ static const struct genl_ops hsr_ops[] = {
 	},
 };
 
-static struct genl_family hsr_genl_family = {
+static struct genl_family hsr_genl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = "HSR",
 	.version = 1,
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 08e62470bac2..6bde9e5a5503 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -131,7 +131,7 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = {
 	[IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, },
 };
 
-struct genl_family nl802154_family = {
+struct genl_family nl802154_family __ro_after_init = {
 	.hdrsize	= 0,
 	.name		= IEEE802154_NL_NAME,
 	.version	= 1,
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index f7e75578aedd..fc60cd061f39 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -2462,7 +2462,7 @@ static const struct genl_ops nl802154_ops[] = {
 #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
 };
 
-static struct genl_family nl802154_fam = {
+static struct genl_family nl802154_fam __ro_after_init = {
 	.name = NL802154_GENL_NAME,	/* have users key off the name instead */
 	.hdrsize = 0,			/* no private header */
 	.version = 1,			/* no particular meaning now */
@@ -2478,7 +2478,7 @@ static struct genl_family nl802154_fam = {
 };
 
 /* initialisation/exit functions */
-int nl802154_init(void)
+int __init nl802154_init(void)
 {
 	return genl_register_family(&nl802154_fam);
 }
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 5b5226a2434f..6cb57bb8692d 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -824,7 +824,7 @@ static const struct genl_ops fou_nl_ops[] = {
 	},
 };
 
-static struct genl_family fou_nl_family = {
+static struct genl_family fou_nl_family __ro_after_init = {
 	.hdrsize	= 0,
 	.name		= FOU_GENL_NAME,
 	.version	= FOU_GENL_VERSION,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bba3c72c4a39..d46f4d5b1c62 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1109,7 +1109,7 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
 	},
 };
 
-static struct genl_family tcp_metrics_nl_family = {
+static struct genl_family tcp_metrics_nl_family __ro_after_init = {
 	.hdrsize	= 0,
 	.name		= TCP_METRICS_GENL_NAME,
 	.version	= TCP_METRICS_GENL_VERSION,
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 97f7b0cc4675..628ae6d85b59 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -553,7 +553,7 @@ static const struct genl_ops ila_nl_ops[] = {
 	},
 };
 
-static struct genl_family ila_nl_family = {
+static struct genl_family ila_nl_family __ro_after_init = {
 	.hdrsize	= 0,
 	.name		= ILA_GENL_NAME,
 	.version	= ILA_GENL_VERSION,
@@ -627,7 +627,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral)
 	return 0;
 }
 
-int ila_xlat_init(void)
+int __init ila_xlat_init(void)
 {
 	int ret;
 
diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c
index 07877347c2f7..7fc340e574cf 100644
--- a/net/irda/irnetlink.c
+++ b/net/irda/irnetlink.c
@@ -141,7 +141,7 @@ static const struct genl_ops irda_nl_ops[] = {
 
 };
 
-static struct genl_family irda_nl_family = {
+static struct genl_family irda_nl_family __ro_after_init = {
 	.name = IRDA_NL_NAME,
 	.hdrsize = 0,
 	.version = IRDA_NL_VERSION,
@@ -151,7 +151,7 @@ static struct genl_family irda_nl_family = {
 	.n_ops = ARRAY_SIZE(irda_nl_ops),
 };
 
-int irda_nl_register(void)
+int __init irda_nl_register(void)
 {
 	return genl_register_family(&irda_nl_family);
 }
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index e4e8c0769a6b..59aa2d204e4a 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -970,7 +970,7 @@ static const struct genl_ops l2tp_nl_ops[] = {
 	},
 };
 
-static struct genl_family l2tp_nl_family = {
+static struct genl_family l2tp_nl_family __ro_after_init = {
 	.name		= L2TP_GENL_NAME,
 	.version	= L2TP_GENL_VERSION,
 	.hdrsize	= 0,
@@ -1016,7 +1016,7 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type)
 }
 EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops);
 
-static int l2tp_nl_init(void)
+static int __init l2tp_nl_init(void)
 {
 	pr_info("L2TP netlink interface\n");
 	return genl_register_family(&l2tp_nl_family);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ea3e8aed063f..6b85ded4f91d 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3865,7 +3865,7 @@ static const struct genl_ops ip_vs_genl_ops[] = {
 	},
 };
 
-static struct genl_family ip_vs_genl_family = {
+static struct genl_family ip_vs_genl_family __ro_after_init = {
 	.hdrsize	= 0,
 	.name		= IPVS_GENL_NAME,
 	.version	= IPVS_GENL_VERSION,
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index ca7c9c411a5c..d177dd066504 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -349,7 +349,7 @@ static const struct genl_ops netlbl_calipso_ops[] = {
 	},
 };
 
-static struct genl_family netlbl_calipso_gnl_family = {
+static struct genl_family netlbl_calipso_gnl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_CALIPSO_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index a665eae91245..4149d3e63589 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -760,7 +760,7 @@ static const struct genl_ops netlbl_cipsov4_ops[] = {
 	},
 };
 
-static struct genl_family netlbl_cipsov4_gnl_family = {
+static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_CIPSOV4_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index ecfe8eb149db..21e0095b1d14 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -828,7 +828,7 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = {
 	},
 };
 
-static struct genl_family netlbl_mgmt_gnl_family = {
+static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_MGMT_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 5dbbad41114f..22dc1b9d6362 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1372,7 +1372,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = {
 	},
 };
 
-static struct genl_family netlbl_unlabel_gnl_family = {
+static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = NETLBL_NLTYPE_UNLABELED_NAME,
 	.version = NETLBL_PROTO_VERSION,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 85659921e7b2..df0cbcddda2c 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -936,7 +936,7 @@ static const struct genl_multicast_group genl_ctrl_groups[] = {
 	{ .name = "notify", },
 };
 
-static struct genl_family genl_ctrl = {
+static struct genl_family genl_ctrl __ro_after_init = {
 	.module = THIS_MODULE,
 	.ops = genl_ctrl_ops,
 	.n_ops = ARRAY_SIZE(genl_ctrl_ops),
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index 450b1e5144cc..03f3d5c7beb8 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -1746,7 +1746,7 @@ static const struct genl_ops nfc_genl_ops[] = {
 	},
 };
 
-static struct genl_family nfc_genl_family = {
+static struct genl_family nfc_genl_family __ro_after_init = {
 	.hdrsize = 0,
 	.name = NFC_GENL_NAME,
 	.version = NFC_GENL_VERSION,
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ad6a111a0014..fa8760176b7d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -670,7 +670,7 @@ static const struct genl_ops dp_packet_genl_ops[] = {
 	}
 };
 
-static struct genl_family dp_packet_genl_family = {
+static struct genl_family dp_packet_genl_family __ro_after_init = {
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_PACKET_FAMILY,
 	.version = OVS_PACKET_VERSION,
@@ -1435,7 +1435,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 	},
 };
 
-static struct genl_family dp_flow_genl_family = {
+static struct genl_family dp_flow_genl_family __ro_after_init = {
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_FLOW_FAMILY,
 	.version = OVS_FLOW_VERSION,
@@ -1821,7 +1821,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 	},
 };
 
-static struct genl_family dp_datapath_genl_family = {
+static struct genl_family dp_datapath_genl_family __ro_after_init = {
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_DATAPATH_FAMILY,
 	.version = OVS_DATAPATH_VERSION,
@@ -2243,7 +2243,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 	},
 };
 
-struct genl_family dp_vport_genl_family = {
+struct genl_family dp_vport_genl_family __ro_after_init = {
 	.hdrsize = sizeof(struct ovs_header),
 	.name = OVS_VPORT_FAMILY,
 	.version = OVS_VPORT_VERSION,
@@ -2272,7 +2272,7 @@ static void dp_unregister_genl(int n_families)
 		genl_unregister_family(dp_genl_families[i]);
 }
 
-static int dp_register_genl(void)
+static int __init dp_register_genl(void)
 {
 	int err;
 	int i;
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 74a405bf107b..26ca8dd64ded 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -249,7 +249,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
 #endif
 };
 
-struct genl_family tipc_genl_family = {
+struct genl_family tipc_genl_family __ro_after_init = {
 	.name		= TIPC_GENL_V2_NAME,
 	.version	= TIPC_GENL_V2_VERSION,
 	.hdrsize	= 0,
@@ -271,7 +271,7 @@ int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr)
 	return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy);
 }
 
-int tipc_netlink_start(void)
+int __init tipc_netlink_start(void)
 {
 	int res;
 
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 07b19931e458..e1ae8a8a2b8e 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1222,7 +1222,7 @@ static struct genl_ops tipc_genl_compat_ops[] = {
 	},
 };
 
-static struct genl_family tipc_genl_compat_family = {
+static struct genl_family tipc_genl_compat_family __ro_after_init = {
 	.name		= TIPC_GENL_NAME,
 	.version	= TIPC_GENL_VERSION,
 	.hdrsize	= TIPC_GENL_HDRLEN,
@@ -1233,7 +1233,7 @@ static struct genl_family tipc_genl_compat_family = {
 	.n_ops		= ARRAY_SIZE(tipc_genl_compat_ops),
 };
 
-int tipc_netlink_compat_start(void)
+int __init tipc_netlink_compat_start(void)
 {
 	int res;
 
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 587e1627681f..5db731512014 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -576,7 +576,7 @@ static const struct genl_multicast_group wimax_gnl_mcgrps[] = {
 	{ .name = "msg", },
 };
 
-struct genl_family wimax_gnl_family = {
+struct genl_family wimax_gnl_family __ro_after_init = {
 	.name = "WiMAX",
 	.version = WIMAX_GNL_VERSION,
 	.hdrsize = 0,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8e5ca3c47593..271707dacfea 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12586,7 +12586,7 @@ static const struct genl_ops nl80211_ops[] = {
 	},
 };
 
-static struct genl_family nl80211_fam = {
+static struct genl_family nl80211_fam __ro_after_init = {
 	.name = NL80211_GENL_NAME,	/* have users key off the name instead */
 	.hdrsize = 0,			/* no private header */
 	.version = 1,			/* no particular meaning now */
@@ -14563,7 +14563,7 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
 
 /* initialisation/exit functions */
 
-int nl80211_init(void)
+int __init nl80211_init(void)
 {
 	int err;
 
-- 
cgit v1.2.3


From b917783c7b350518f8c5d88bb5848aa8064408a6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Oct 2016 18:49:46 +0200
Subject: flow_dissector: __skb_get_hash_symmetric arg can be const

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h    | 2 +-
 net/core/flow_dissector.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f6e621..663fda2887f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1086,7 +1086,7 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
 }
 
 void __skb_get_hash(struct sk_buff *skb);
-u32 __skb_get_hash_symmetric(struct sk_buff *skb);
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 1a7b80f73376..0cc607d05fc8 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -723,7 +723,7 @@ EXPORT_SYMBOL(make_flow_keys_digest);
 
 static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
 
-u32 __skb_get_hash_symmetric(struct sk_buff *skb)
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
 {
 	struct flow_keys keys;
 
-- 
cgit v1.2.3


From 5579e1519bad43b874922dbe87c74fdcbd97a7db Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 31 Aug 2016 05:17:54 +0000
Subject: net/mlx5: Update struct mlx5_ifc_xrqc_bits

Update struct mlx5_ifc_xrqc_bits according to last specification

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6045d4d58065..12f72e45a3f0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2844,7 +2844,7 @@ struct mlx5_ifc_xrqc_bits {
 
 	struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context;
 
-	u8         reserved_at_180[0x200];
+	u8         reserved_at_180[0x880];
 
 	struct mlx5_ifc_wq_bits wq;
 };
-- 
cgit v1.2.3


From dd257efb1e0f8875ed7e42b88837a8dada0d0e41 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 31 Aug 2016 05:29:58 +0000
Subject: net/mlx5: Ensure SRQ physical address structure endianness

SRQ physical address structure field should be in big-endian format.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/srq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h
index 33c97dc900f8..1cde0fd53f90 100644
--- a/include/linux/mlx5/srq.h
+++ b/include/linux/mlx5/srq.h
@@ -55,7 +55,7 @@ struct mlx5_srq_attr {
 	u32 lwm;
 	u32 user_index;
 	u64 db_record;
-	u64 *pas;
+	__be64 *pas;
 };
 
 struct mlx5_core_dev;
-- 
cgit v1.2.3


From 813f854053c26204e2723c498def4c7870dcc7f4 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Thu, 11 Aug 2016 11:21:39 +0300
Subject: net/mlx5: Introduce TSAR manipulation firmware commands

TSAR (stands for Transmit Scheduling ARbiter) is a hardware component
that is responsible for selecting the next entity to serve on the
transmit path.
The arbitration defines the QoS policy between the agents connected to
the TSAR.
The TSAR consists of two main features:
1) BW Allocation between agents:
The TSAR implements a deficit weighted round robin between the agents.
Each agent attached to the TSAR is assigned with a weight and it is
awarded transmission tokens according to this weight.
2) Rate limiter per agent:
Each agent attached to the TSAR is (optionally) assigned with a rate
limit.
TSAR will not allow scheduling for an agent exceeding its defined rate
limit.

In this patch we implement the API of manipulating the TSAR.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      |  13 +-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   7 +
 drivers/net/ethernet/mellanox/mlx5/core/rl.c       |  65 +++++++
 include/linux/mlx5/mlx5_ifc.h                      | 199 ++++++++++++++++++++-
 4 files changed, 279 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 1e639f886021..8561102f2563 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -318,6 +318,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_SET_FLOW_TABLE_ROOT:
 	case MLX5_CMD_OP_DEALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT:
 		return MLX5_CMD_STAT_OK;
 
 	case MLX5_CMD_OP_QUERY_HCA_CAP:
@@ -419,11 +421,14 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
 	case MLX5_CMD_OP_QUERY_FLOW_TABLE:
 	case MLX5_CMD_OP_CREATE_FLOW_GROUP:
 	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
-
 	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
 	case MLX5_CMD_OP_ALLOC_FLOW_COUNTER:
 	case MLX5_CMD_OP_QUERY_FLOW_COUNTER:
 	case MLX5_CMD_OP_ALLOC_ENCAP_HEADER:
+	case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT:
+	case MLX5_CMD_OP_CREATE_QOS_PARA_VPORT:
 		*status = MLX5_DRIVER_STATUS_ABORTED;
 		*synd = MLX5_DRIVER_SYND;
 		return -EIO;
@@ -580,6 +585,12 @@ const char *mlx5_command_str(int command)
 	MLX5_COMMAND_STR_CASE(MODIFY_FLOW_TABLE);
 	MLX5_COMMAND_STR_CASE(ALLOC_ENCAP_HEADER);
 	MLX5_COMMAND_STR_CASE(DEALLOC_ENCAP_HEADER);
+	MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(QUERY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(MODIFY_SCHEDULING_ELEMENT);
+	MLX5_COMMAND_STR_CASE(CREATE_QOS_PARA_VPORT);
+	MLX5_COMMAND_STR_CASE(DESTROY_QOS_PARA_VPORT);
 	default: return "unknown command opcode";
 	}
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 3d0cfb9f18f9..bf431715172c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -91,6 +91,13 @@ int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs);
 bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev);
 int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id);
 int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
+int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *context, u32 *element_id);
+int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *context, u32 element_id,
+				       u32 modify_bitmask);
+int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+					u32 element_id);
 int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev);
 cycle_t mlx5_read_internal_timer(struct mlx5_core_dev *dev);
 u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
index 104902a93a0b..e651e4c02867 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
@@ -36,6 +36,102 @@
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
 
+/* Scheduling element fw management */
+
+/**
+ * mlx5_create_scheduling_element_cmd - execute CREATE_SCHEDULING_ELEMENT
+ * @dev: device passed to mlx5_cmd_exec()
+ * @hierarchy: value for the command's scheduling_hierarchy field
+ * @ctx: source buffer; MLX5_ST_SZ_BYTES(scheduling_context) bytes are copied
+ *	from it into the command's scheduling_context
+ * @element_id: set to the output's scheduling_element_id on success
+ *
+ * Return: 0 on success, otherwise the error returned by mlx5_cmd_exec().
+ */
+int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *ctx, u32 *element_id)
+{
+	u32 in[MLX5_ST_SZ_DW(create_scheduling_element_in)]   = {0};
+	/* The outbox is parsed with the _out layout, so size it by that */
+	u32 out[MLX5_ST_SZ_DW(create_scheduling_element_out)] = {0};
+	void *schedc;
+	int err;
+
+	schedc = MLX5_ADDR_OF(create_scheduling_element_in, in,
+			      scheduling_context);
+	MLX5_SET(create_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT);
+	MLX5_SET(create_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+	memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context));
+
+	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	if (err)
+		return err;
+
+	*element_id = MLX5_GET(create_scheduling_element_out, out,
+			       scheduling_element_id);
+	return 0;
+}
+
+/**
+ * mlx5_modify_scheduling_element_cmd - execute MODIFY_SCHEDULING_ELEMENT
+ * @dev: device passed to mlx5_cmd_exec()
+ * @hierarchy: value for the command's scheduling_hierarchy field
+ * @ctx: source buffer; MLX5_ST_SZ_BYTES(scheduling_context) bytes are copied
+ *	from it into the command's scheduling_context
+ * @element_id: value for the command's scheduling_element_id field
+ * @modify_bitmask: value for the command's modify_bitmask field
+ *
+ * Return: the result of mlx5_cmd_exec().
+ */
+int mlx5_modify_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+				       void *ctx, u32 element_id,
+				       u32 modify_bitmask)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_scheduling_element_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(modify_scheduling_element_out)] = {0};
+	void *schedc;
+
+	schedc = MLX5_ADDR_OF(modify_scheduling_element_in, in,
+			      scheduling_context);
+	MLX5_SET(modify_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT);
+	MLX5_SET(modify_scheduling_element_in, in, scheduling_element_id,
+		 element_id);
+	MLX5_SET(modify_scheduling_element_in, in, modify_bitmask,
+		 modify_bitmask);
+	MLX5_SET(modify_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+	memcpy(schedc, ctx, MLX5_ST_SZ_BYTES(scheduling_context));
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+/**
+ * mlx5_destroy_scheduling_element_cmd - execute DESTROY_SCHEDULING_ELEMENT
+ * @dev: device passed to mlx5_cmd_exec()
+ * @hierarchy: value for the command's scheduling_hierarchy field
+ * @element_id: value for the command's scheduling_element_id field
+ *
+ * Return: the result of mlx5_cmd_exec().
+ */
+int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
+					u32 element_id)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_scheduling_element_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(destroy_scheduling_element_out)] = {0};
+
+	MLX5_SET(destroy_scheduling_element_in, in, opcode,
+		 MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT);
+	MLX5_SET(destroy_scheduling_element_in, in, scheduling_element_id,
+		 element_id);
+	MLX5_SET(destroy_scheduling_element_in, in, scheduling_hierarchy,
+		 hierarchy);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
 /* Finds an entry where we can register the given rate
  * If the rate already exists, return the entry where it is registered,
  * otherwise return the first available entry.
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 12f72e45a3f0..2632cb2caf10 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -145,6 +145,12 @@ enum {
 	MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
 	MLX5_CMD_OP_SET_RATE_LIMIT                = 0x780,
 	MLX5_CMD_OP_QUERY_RATE_LIMIT              = 0x781,
+	MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT      = 0x782,
+	MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT     = 0x783,
+	MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT       = 0x784,
+	MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT      = 0x785,
+	MLX5_CMD_OP_CREATE_QOS_PARA_VPORT         = 0x786,
+	MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT        = 0x787,
 	MLX5_CMD_OP_ALLOC_PD                      = 0x800,
 	MLX5_CMD_OP_DEALLOC_PD                    = 0x801,
 	MLX5_CMD_OP_ALLOC_UAR                     = 0x802,
@@ -537,13 +543,27 @@ struct mlx5_ifc_e_switch_cap_bits {
 
 struct mlx5_ifc_qos_cap_bits {
 	u8         packet_pacing[0x1];
-	u8         reserved_0[0x1f];
-	u8         reserved_1[0x20];
+	u8         esw_scheduling[0x1];
+	u8         reserved_at_2[0x1e];
+
+	u8         reserved_at_20[0x20];
+
 	u8         packet_pacing_max_rate[0x20];
+
 	u8         packet_pacing_min_rate[0x20];
-	u8         reserved_2[0x10];
+
+	u8         reserved_at_80[0x10];
 	u8         packet_pacing_rate_table_size[0x10];
-	u8         reserved_3[0x760];
+
+	u8         esw_element_type[0x10];
+	u8         esw_tsar_type[0x10];
+
+	u8         reserved_at_c0[0x10];
+	u8         max_qos_para_vport[0x10];
+
+	u8         max_tsar_bw_share[0x20];
+
+	u8         reserved_at_100[0x700];
 };
 
 struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
@@ -2333,6 +2353,30 @@ struct mlx5_ifc_sqc_bits {
 	struct mlx5_ifc_wq_bits wq;
 };
 
+enum {
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR = 0x0,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT = 0x1,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3,
+};
+
+struct mlx5_ifc_scheduling_context_bits {
+	u8         element_type[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         element_attributes[0x20];
+
+	u8         parent_element_id[0x20];
+
+	u8         reserved_at_60[0x40];
+
+	u8         bw_share[0x20];
+
+	u8         max_average_bw[0x20];
+
+	u8         reserved_at_e0[0x120];
+};
+
 struct mlx5_ifc_rqtc_bits {
 	u8         reserved_at_0[0xa0];
 
@@ -2920,6 +2964,29 @@ struct mlx5_ifc_register_loopback_control_bits {
 	u8         reserved_at_20[0x60];
 };
 
+struct mlx5_ifc_vport_tc_element_bits {
+	u8         traffic_class[0x4];
+	u8         reserved_at_4[0xc];
+	u8         vport_number[0x10];
+};
+
+struct mlx5_ifc_vport_element_bits {
+	u8         reserved_at_0[0x10];
+	u8         vport_number[0x10];
+};
+
+enum {
+	TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0,
+	TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1,
+	TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2,
+};
+
+struct mlx5_ifc_tsar_element_bits {
+	u8         reserved_at_0[0x8];
+	u8         tsar_type[0x8];
+	u8         reserved_at_10[0x10];
+};
+
 struct mlx5_ifc_teardown_hca_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -3540,6 +3607,39 @@ struct mlx5_ifc_query_special_contexts_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_query_scheduling_element_out_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0xc0];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
+enum {
+	SCHEDULING_HIERARCHY_E_SWITCH = 0x2,
+};
+
+struct mlx5_ifc_query_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_query_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -4725,6 +4825,43 @@ struct mlx5_ifc_modify_sq_in_bits {
 	struct mlx5_ifc_sqc_bits ctx;
 };
 
+struct mlx5_ifc_modify_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x1c0];
+};
+
+enum {
+	MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE = 0x1,
+	MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW = 0x2,
+};
+
+struct mlx5_ifc_modify_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x20];
+
+	u8         modify_bitmask[0x20];
+
+	u8         reserved_at_c0[0x40];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
 struct mlx5_ifc_modify_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -5390,6 +5527,30 @@ struct mlx5_ifc_destroy_sq_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_destroy_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x1c0];
+};
+
+struct mlx5_ifc_destroy_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_destroy_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -6017,6 +6178,36 @@ struct mlx5_ifc_create_sq_in_bits {
 	struct mlx5_ifc_sqc_bits ctx;
 };
 
+struct mlx5_ifc_create_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_create_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         reserved_at_60[0xa0];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
 struct mlx5_ifc_create_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From 74491de937125d0c98c9b9c9208b4105717a3caa Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Wed, 31 Aug 2016 11:24:25 +0000
Subject: net/mlx5: Add multi dest support

Currently when calling mlx5_add_flow_rule we accept
only one flow destination; this commit allows passing
multiple destinations.

This change forces us to change the return structure to a more
flexible one. We introduce a flow handle (struct mlx5_flow_handle);
it internally holds the number of rules created and an array
where each cell points to a flow rule.

From the point of view of the consumers (of mlx5_add_flow_rule), this
change is only cosmetic and only requires changing the type
of the returned value they store.

From the core point of view, we now need to use a loop when
allocating and deleting rules (e.g. when given a flow handle).

Signed-off-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/main.c                  |  14 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  14 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c  |  38 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |  49 ++--
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  19 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  32 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  68 ++---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  22 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  42 +--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 289 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   5 +
 include/linux/mlx5/fs.h                            |  28 +-
 14 files changed, 374 insertions(+), 254 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d02341eebddb..8e0dbd51944e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1771,13 +1771,13 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
 	mutex_lock(&dev->flow_db.lock);
 
 	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
-		mlx5_del_flow_rule(iter->rule);
+		mlx5_del_flow_rules(iter->rule);
 		put_flow_table(dev, iter->prio, true);
 		list_del(&iter->list);
 		kfree(iter);
 	}
 
-	mlx5_del_flow_rule(handler->rule);
+	mlx5_del_flow_rules(handler->rule);
 	put_flow_table(dev, handler->prio, true);
 	mutex_unlock(&dev->flow_db.lock);
 
@@ -1907,10 +1907,10 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
 	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
 	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
 		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
-	handler->rule = mlx5_add_flow_rule(ft, spec,
+	handler->rule = mlx5_add_flow_rules(ft, spec,
 					   action,
 					   MLX5_FS_DEFAULT_FLOW_TAG,
-					   dst);
+					   dst, 1);
 
 	if (IS_ERR(handler->rule)) {
 		err = PTR_ERR(handler->rule);
@@ -1941,7 +1941,7 @@ static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *de
 		handler_dst = create_flow_rule(dev, ft_prio,
 					       flow_attr, dst);
 		if (IS_ERR(handler_dst)) {
-			mlx5_del_flow_rule(handler->rule);
+			mlx5_del_flow_rules(handler->rule);
 			ft_prio->refcount--;
 			kfree(handler);
 			handler = handler_dst;
@@ -2004,7 +2004,7 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
 						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
 						 dst);
 		if (IS_ERR(handler_ucast)) {
-			mlx5_del_flow_rule(handler->rule);
+			mlx5_del_flow_rules(handler->rule);
 			ft_prio->refcount--;
 			kfree(handler);
 			handler = handler_ucast;
@@ -2046,7 +2046,7 @@ static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
 	return handler_rx;
 
 err_tx:
-	mlx5_del_flow_rule(handler_rx->rule);
+	mlx5_del_flow_rules(handler_rx->rule);
 	ft_rx->refcount--;
 	kfree(handler_rx);
 err:
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index dcdcd195fe53..d5d007740159 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -153,7 +153,7 @@ struct mlx5_ib_flow_handler {
 	struct list_head		list;
 	struct ib_flow			ibflow;
 	struct mlx5_ib_flow_prio	*prio;
-	struct mlx5_flow_rule	*rule;
+	struct mlx5_flow_handle		*rule;
 };
 
 struct mlx5_ib_flow_db {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 460363b66cb1..47ee8ffe987f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -520,7 +520,7 @@ struct mlx5e_vxlan_db {
 
 struct mlx5e_l2_rule {
 	u8  addr[ETH_ALEN + 2];
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 };
 
 struct mlx5e_flow_table {
@@ -541,10 +541,10 @@ struct mlx5e_tc_table {
 struct mlx5e_vlan_table {
 	struct mlx5e_flow_table		ft;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
-	struct mlx5_flow_rule	*active_vlans_rule[VLAN_N_VID];
-	struct mlx5_flow_rule	*untagged_rule;
-	struct mlx5_flow_rule	*any_vlan_rule;
-	bool          filter_disabled;
+	struct mlx5_flow_handle	*active_vlans_rule[VLAN_N_VID];
+	struct mlx5_flow_handle	*untagged_rule;
+	struct mlx5_flow_handle	*any_vlan_rule;
+	bool		filter_disabled;
 };
 
 struct mlx5e_l2_table {
@@ -562,14 +562,14 @@ struct mlx5e_l2_table {
 /* L3/L4 traffic type classifier */
 struct mlx5e_ttc_table {
 	struct mlx5e_flow_table  ft;
-	struct mlx5_flow_rule	 *rules[MLX5E_NUM_TT];
+	struct mlx5_flow_handle	 *rules[MLX5E_NUM_TT];
 };
 
 #define ARFS_HASH_SHIFT BITS_PER_BYTE
 #define ARFS_HASH_SIZE BIT(BITS_PER_BYTE)
 struct arfs_table {
 	struct mlx5e_flow_table  ft;
-	struct mlx5_flow_rule    *default_rule;
+	struct mlx5_flow_handle	 *default_rule;
 	struct hlist_head	 rules_hash[ARFS_HASH_SIZE];
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index a8cb38789774..8ff22e83e1dd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -56,7 +56,7 @@ struct arfs_tuple {
 struct arfs_rule {
 	struct mlx5e_priv	*priv;
 	struct work_struct      arfs_work;
-	struct mlx5_flow_rule   *rule;
+	struct mlx5_flow_handle *rule;
 	struct hlist_node	hlist;
 	int			rxq;
 	/* Flow ID passed to ndo_rx_flow_steer */
@@ -104,7 +104,7 @@ static int arfs_disable(struct mlx5e_priv *priv)
 		tt = arfs_get_tt(i);
 		/* Modify ttc rules destination to bypass the aRFS tables*/
 		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
-						   &dest);
+						   &dest, NULL);
 		if (err) {
 			netdev_err(priv->netdev,
 				   "%s: modify ttc destination failed\n",
@@ -137,7 +137,7 @@ int mlx5e_arfs_enable(struct mlx5e_priv *priv)
 		tt = arfs_get_tt(i);
 		/* Modify ttc rules destination to point on the aRFS FTs */
 		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
-						   &dest);
+						   &dest, NULL);
 		if (err) {
 			netdev_err(priv->netdev,
 				   "%s: modify ttc destination failed err=%d\n",
@@ -151,7 +151,7 @@ int mlx5e_arfs_enable(struct mlx5e_priv *priv)
 
 static void arfs_destroy_table(struct arfs_table *arfs_t)
 {
-	mlx5_del_flow_rule(arfs_t->default_rule);
+	mlx5_del_flow_rules(arfs_t->default_rule);
 	mlx5e_destroy_flow_table(&arfs_t->ft);
 }
 
@@ -205,10 +205,10 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
 		goto out;
 	}
 
-	arfs_t->default_rule = mlx5_add_flow_rule(arfs_t->ft.t, spec,
-						  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-						  MLX5_FS_DEFAULT_FLOW_TAG,
-						  &dest);
+	arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, spec,
+						   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+						   MLX5_FS_DEFAULT_FLOW_TAG,
+						   &dest, 1);
 	if (IS_ERR(arfs_t->default_rule)) {
 		err = PTR_ERR(arfs_t->default_rule);
 		arfs_t->default_rule = NULL;
@@ -396,7 +396,7 @@ static void arfs_may_expire_flow(struct mlx5e_priv *priv)
 	spin_unlock_bh(&priv->fs.arfs.arfs_lock);
 	hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) {
 		if (arfs_rule->rule)
-			mlx5_del_flow_rule(arfs_rule->rule);
+			mlx5_del_flow_rules(arfs_rule->rule);
 		hlist_del(&arfs_rule->hlist);
 		kfree(arfs_rule);
 	}
@@ -420,7 +420,7 @@ static void arfs_del_rules(struct mlx5e_priv *priv)
 	hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) {
 		cancel_work_sync(&rule->arfs_work);
 		if (rule->rule)
-			mlx5_del_flow_rule(rule->rule);
+			mlx5_del_flow_rules(rule->rule);
 		hlist_del(&rule->hlist);
 		kfree(rule);
 	}
@@ -462,12 +462,12 @@ static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs,
 	return NULL;
 }
 
-static struct mlx5_flow_rule *arfs_add_rule(struct mlx5e_priv *priv,
-					    struct arfs_rule *arfs_rule)
+static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
+					      struct arfs_rule *arfs_rule)
 {
 	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
 	struct arfs_tuple *tuple = &arfs_rule->tuple;
-	struct mlx5_flow_rule *rule = NULL;
+	struct mlx5_flow_handle *rule = NULL;
 	struct mlx5_flow_destination dest;
 	struct arfs_table *arfs_table;
 	struct mlx5_flow_spec *spec;
@@ -544,9 +544,9 @@ static struct mlx5_flow_rule *arfs_add_rule(struct mlx5e_priv *priv,
 	}
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = priv->direct_tir[arfs_rule->rxq].tirn;
-	rule = mlx5_add_flow_rule(ft, spec, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				  MLX5_FS_DEFAULT_FLOW_TAG,
-				  &dest);
+	rule = mlx5_add_flow_rules(ft, spec, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				   MLX5_FS_DEFAULT_FLOW_TAG,
+				   &dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule(filter id=%d, rq idx=%d) failed, err=%d\n",
@@ -559,14 +559,14 @@ out:
 }
 
 static void arfs_modify_rule_rq(struct mlx5e_priv *priv,
-				struct mlx5_flow_rule *rule, u16 rxq)
+				struct mlx5_flow_handle *rule, u16 rxq)
 {
 	struct mlx5_flow_destination dst;
 	int err = 0;
 
 	dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dst.tir_num = priv->direct_tir[rxq].tirn;
-	err =  mlx5_modify_rule_destination(rule, &dst);
+	err =  mlx5_modify_rule_destination(rule, &dst, NULL);
 	if (err)
 		netdev_warn(priv->netdev,
 			    "Failed to modfiy aRFS rule destination to rq=%d\n", rxq);
@@ -578,7 +578,7 @@ static void arfs_handle_work(struct work_struct *work)
 						   struct arfs_rule,
 						   arfs_work);
 	struct mlx5e_priv *priv = arfs_rule->priv;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 
 	mutex_lock(&priv->state_lock);
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 36fbc6b21a33..bed544d47ba1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -160,7 +160,7 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 {
 	struct mlx5_flow_table *ft = priv->fs.vlan.ft.t;
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule **rule_p;
+	struct mlx5_flow_handle **rule_p;
 	int err = 0;
 
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
@@ -187,10 +187,10 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	*rule_p = mlx5_add_flow_rule(ft, spec,
-				     MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				     MLX5_FS_DEFAULT_FLOW_TAG,
-				     &dest);
+	*rule_p = mlx5_add_flow_rules(ft, spec,
+				      MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				      MLX5_FS_DEFAULT_FLOW_TAG,
+				      &dest, 1);
 
 	if (IS_ERR(*rule_p)) {
 		err = PTR_ERR(*rule_p);
@@ -229,20 +229,20 @@ static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv,
 	switch (rule_type) {
 	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
 		if (priv->fs.vlan.untagged_rule) {
-			mlx5_del_flow_rule(priv->fs.vlan.untagged_rule);
+			mlx5_del_flow_rules(priv->fs.vlan.untagged_rule);
 			priv->fs.vlan.untagged_rule = NULL;
 		}
 		break;
 	case MLX5E_VLAN_RULE_TYPE_ANY_VID:
 		if (priv->fs.vlan.any_vlan_rule) {
-			mlx5_del_flow_rule(priv->fs.vlan.any_vlan_rule);
+			mlx5_del_flow_rules(priv->fs.vlan.any_vlan_rule);
 			priv->fs.vlan.any_vlan_rule = NULL;
 		}
 		break;
 	case MLX5E_VLAN_RULE_TYPE_MATCH_VID:
 		mlx5e_vport_context_update_vlans(priv);
 		if (priv->fs.vlan.active_vlans_rule[vid]) {
-			mlx5_del_flow_rule(priv->fs.vlan.active_vlans_rule[vid]);
+			mlx5_del_flow_rules(priv->fs.vlan.active_vlans_rule[vid]);
 			priv->fs.vlan.active_vlans_rule[vid] = NULL;
 		}
 		mlx5e_vport_context_update_vlans(priv);
@@ -560,7 +560,7 @@ static void mlx5e_cleanup_ttc_rules(struct mlx5e_ttc_table *ttc)
 
 	for (i = 0; i < MLX5E_NUM_TT; i++) {
 		if (!IS_ERR_OR_NULL(ttc->rules[i])) {
-			mlx5_del_flow_rule(ttc->rules[i]);
+			mlx5_del_flow_rules(ttc->rules[i]);
 			ttc->rules[i] = NULL;
 		}
 	}
@@ -616,13 +616,14 @@ static struct {
 	},
 };
 
-static struct mlx5_flow_rule *mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
-						      struct mlx5_flow_table *ft,
-						      struct mlx5_flow_destination *dest,
-						      u16 etype,
-						      u8 proto)
+static struct mlx5_flow_handle *
+mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
+			struct mlx5_flow_table *ft,
+			struct mlx5_flow_destination *dest,
+			u16 etype,
+			u8 proto)
 {
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
@@ -643,10 +644,10 @@ static struct mlx5_flow_rule *mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
 		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
 	}
 
-	rule = mlx5_add_flow_rule(ft, spec,
-				  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				  MLX5_FS_DEFAULT_FLOW_TAG,
-				  dest);
+	rule = mlx5_add_flow_rules(ft, spec,
+				   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				   MLX5_FS_DEFAULT_FLOW_TAG,
+				   dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
@@ -660,7 +661,7 @@ static int mlx5e_generate_ttc_table_rules(struct mlx5e_priv *priv)
 {
 	struct mlx5_flow_destination dest;
 	struct mlx5e_ttc_table *ttc;
-	struct mlx5_flow_rule **rules;
+	struct mlx5_flow_handle **rules;
 	struct mlx5_flow_table *ft;
 	int tt;
 	int err;
@@ -801,7 +802,7 @@ static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
 				   struct mlx5e_l2_rule *ai)
 {
 	if (!IS_ERR_OR_NULL(ai->rule)) {
-		mlx5_del_flow_rule(ai->rule);
+		mlx5_del_flow_rules(ai->rule);
 		ai->rule = NULL;
 	}
 }
@@ -847,9 +848,9 @@ static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	ai->rule = mlx5_add_flow_rule(ft, spec,
-				      MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				      MLX5_FS_DEFAULT_FLOW_TAG, &dest);
+	ai->rule = mlx5_add_flow_rules(ft, spec,
+				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				       MLX5_FS_DEFAULT_FLOW_TAG, &dest, 1);
 	if (IS_ERR(ai->rule)) {
 		netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n",
 			   __func__, mv_dmac);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index d17c24227900..cf52c06377f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -36,7 +36,7 @@
 struct mlx5e_ethtool_rule {
 	struct list_head             list;
 	struct ethtool_rx_flow_spec  flow_spec;
-	struct mlx5_flow_rule        *rule;
+	struct mlx5_flow_handle	     *rule;
 	struct mlx5e_ethtool_table   *eth_ft;
 };
 
@@ -284,13 +284,14 @@ static bool outer_header_zero(u32 *match_criteria)
 						  size - 1);
 }
 
-static struct mlx5_flow_rule *add_ethtool_flow_rule(struct mlx5e_priv *priv,
-						    struct mlx5_flow_table *ft,
-						    struct ethtool_rx_flow_spec *fs)
+static struct mlx5_flow_handle *
+add_ethtool_flow_rule(struct mlx5e_priv *priv,
+		      struct mlx5_flow_table *ft,
+		      struct ethtool_rx_flow_spec *fs)
 {
 	struct mlx5_flow_destination *dst = NULL;
 	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	int err = 0;
 	u32 action;
 
@@ -317,8 +318,8 @@ static struct mlx5_flow_rule *add_ethtool_flow_rule(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
-	rule = mlx5_add_flow_rule(ft, spec, action,
-				  MLX5_FS_DEFAULT_FLOW_TAG, dst);
+	rule = mlx5_add_flow_rules(ft, spec, action,
+				   MLX5_FS_DEFAULT_FLOW_TAG, dst, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n",
@@ -335,7 +336,7 @@ static void del_ethtool_rule(struct mlx5e_priv *priv,
 			     struct mlx5e_ethtool_rule *eth_rule)
 {
 	if (eth_rule->rule)
-		mlx5_del_flow_rule(eth_rule->rule);
+		mlx5_del_flow_rules(eth_rule->rule);
 	list_del(&eth_rule->list);
 	priv->fs.ethtool.tot_num_rules--;
 	put_flow_table(eth_rule->eth_ft);
@@ -475,7 +476,7 @@ int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv,
 {
 	struct mlx5e_ethtool_table *eth_ft;
 	struct mlx5e_ethtool_rule *eth_rule;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	int num_tuples;
 	int err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3c97da103d30..88d3fd132d63 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -328,7 +328,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_eswitch_rep *rep = priv->ppriv;
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	int err;
 	int i;
 
@@ -360,7 +360,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
 	return 0;
 
 err_del_flow_rule:
-	mlx5_del_flow_rule(rep->vport_rx_rule);
+	mlx5_del_flow_rules(rep->vport_rx_rule);
 err_destroy_direct_tirs:
 	mlx5e_destroy_direct_tirs(priv);
 err_destroy_direct_rqts:
@@ -375,7 +375,7 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
 	int i;
 
 	mlx5e_tc_cleanup(priv);
-	mlx5_del_flow_rule(rep->vport_rx_rule);
+	mlx5_del_flow_rules(rep->vport_rx_rule);
 	mlx5e_destroy_direct_tirs(priv);
 	for (i = 0; i < priv->params.num_channels; i++)
 		mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ce8c54d18906..5d9ac0dbf3bf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -47,21 +47,22 @@
 struct mlx5e_tc_flow {
 	struct rhash_head	node;
 	u64			cookie;
-	struct mlx5_flow_rule	*rule;
+	struct mlx5_flow_handle *rule;
 	struct mlx5_esw_flow_attr *attr;
 };
 
 #define MLX5E_TC_TABLE_NUM_ENTRIES 1024
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 
-static struct mlx5_flow_rule *mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
-						    struct mlx5_flow_spec *spec,
-						    u32 action, u32 flow_tag)
+static struct mlx5_flow_handle *
+mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
+		      struct mlx5_flow_spec *spec,
+		      u32 action, u32 flow_tag)
 {
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mlx5_flow_destination dest = { 0 };
 	struct mlx5_fc *counter = NULL;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	bool table_created = false;
 
 	if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
@@ -94,9 +95,9 @@ static struct mlx5_flow_rule *mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-	rule = mlx5_add_flow_rule(priv->fs.tc.t, spec,
-				  action, flow_tag,
-				  &dest);
+	rule = mlx5_add_flow_rules(priv->fs.tc.t, spec,
+				   action, flow_tag,
+				   &dest, 1);
 
 	if (IS_ERR(rule))
 		goto err_add_rule;
@@ -114,9 +115,10 @@ err_create_ft:
 	return rule;
 }
 
-static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
-						    struct mlx5_flow_spec *spec,
-						    struct mlx5_esw_flow_attr *attr)
+static struct mlx5_flow_handle *
+mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
+		      struct mlx5_flow_spec *spec,
+		      struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	int err;
@@ -129,7 +131,7 @@ static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 }
 
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
-			      struct mlx5_flow_rule *rule,
+			      struct mlx5_flow_handle *rule,
 			      struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
@@ -140,7 +142,7 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
 	if (esw && esw->mode == SRIOV_OFFLOADS)
 		mlx5_eswitch_del_vlan_action(esw, attr);
 
-	mlx5_del_flow_rule(rule);
+	mlx5_del_flow_rules(rule);
 
 	mlx5_fc_destroy(priv->mdev, counter);
 
@@ -450,7 +452,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	u32 flow_tag, action;
 	struct mlx5e_tc_flow *flow;
 	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_rule *old = NULL;
+	struct mlx5_flow_handle *old = NULL;
 	struct mlx5_esw_flow_attr *old_attr = NULL;
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
@@ -511,7 +513,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	goto out;
 
 err_del_rule:
-	mlx5_del_flow_rule(flow->rule);
+	mlx5_del_flow_rules(flow->rule);
 
 err_free:
 	if (!old)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9ef01d1bea06..fcd8b15f6625 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -56,7 +56,7 @@ struct esw_uc_addr {
 /* E-Switch MC FDB table hash node */
 struct esw_mc_addr { /* SRIOV only */
 	struct l2addr_node     node;
-	struct mlx5_flow_rule *uplink_rule; /* Forward to uplink rule */
+	struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
 	u32                    refcnt;
 };
 
@@ -65,7 +65,7 @@ struct vport_addr {
 	struct l2addr_node     node;
 	u8                     action;
 	u32                    vport;
-	struct mlx5_flow_rule *flow_rule; /* SRIOV only */
+	struct mlx5_flow_handle *flow_rule; /* SRIOV only */
 	/* A flag indicating that mac was added due to mc promiscuous vport */
 	bool mc_promisc;
 };
@@ -237,13 +237,13 @@ static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index)
 }
 
 /* E-Switch FDB */
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 			 u8 mac_c[ETH_ALEN], u8 mac_v[ETH_ALEN])
 {
 	int match_header = (is_zero_ether_addr(mac_c) ? 0 :
 			    MLX5_MATCH_OUTER_HEADERS);
-	struct mlx5_flow_rule *flow_rule = NULL;
+	struct mlx5_flow_handle *flow_rule = NULL;
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_spec *spec;
 	void *mv_misc = NULL;
@@ -286,9 +286,9 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 		  dmac_v, dmac_c, vport);
 	spec->match_criteria_enable = match_header;
 	flow_rule =
-		mlx5_add_flow_rule(esw->fdb_table.fdb, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				   0, &dest);
+		mlx5_add_flow_rules(esw->fdb_table.fdb, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				    0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev,
 			 "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n",
@@ -300,7 +300,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 	return flow_rule;
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -309,7 +309,7 @@ esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport)
 	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac);
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -322,7 +322,7 @@ esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport)
 	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac_v);
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -515,7 +515,7 @@ static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	del_l2_table_entry(esw->dev, esw_uc->table_index);
 
 	if (vaddr->flow_rule)
-		mlx5_del_flow_rule(vaddr->flow_rule);
+		mlx5_del_flow_rules(vaddr->flow_rule);
 	vaddr->flow_rule = NULL;
 
 	l2addr_hash_del(esw_uc);
@@ -562,7 +562,7 @@ static void update_allmulti_vports(struct mlx5_eswitch *esw,
 		case MLX5_ACTION_DEL:
 			if (!iter_vaddr)
 				continue;
-			mlx5_del_flow_rule(iter_vaddr->flow_rule);
+			mlx5_del_flow_rules(iter_vaddr->flow_rule);
 			l2addr_hash_del(iter_vaddr);
 			break;
 		}
@@ -632,7 +632,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 		  esw_mc->uplink_rule);
 
 	if (vaddr->flow_rule)
-		mlx5_del_flow_rule(vaddr->flow_rule);
+		mlx5_del_flow_rules(vaddr->flow_rule);
 	vaddr->flow_rule = NULL;
 
 	/* If the multicast mac is added as a result of mc promiscuous vport,
@@ -645,7 +645,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	update_allmulti_vports(esw, vaddr, esw_mc);
 
 	if (esw_mc->uplink_rule)
-		mlx5_del_flow_rule(esw_mc->uplink_rule);
+		mlx5_del_flow_rules(esw_mc->uplink_rule);
 
 	l2addr_hash_del(esw_mc);
 	return 0;
@@ -828,14 +828,14 @@ static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
 								UPLINK_VPORT);
 		allmulti_addr->refcnt++;
 	} else if (vport->allmulti_rule) {
-		mlx5_del_flow_rule(vport->allmulti_rule);
+		mlx5_del_flow_rules(vport->allmulti_rule);
 		vport->allmulti_rule = NULL;
 
 		if (--allmulti_addr->refcnt > 0)
 			goto promisc;
 
 		if (allmulti_addr->uplink_rule)
-			mlx5_del_flow_rule(allmulti_addr->uplink_rule);
+			mlx5_del_flow_rules(allmulti_addr->uplink_rule);
 		allmulti_addr->uplink_rule = NULL;
 	}
 
@@ -847,7 +847,7 @@ promisc:
 		vport->promisc_rule = esw_fdb_set_vport_promisc_rule(esw,
 								     vport_num);
 	} else if (vport->promisc_rule) {
-		mlx5_del_flow_rule(vport->promisc_rule);
+		mlx5_del_flow_rules(vport->promisc_rule);
 		vport->promisc_rule = NULL;
 	}
 }
@@ -1015,10 +1015,10 @@ static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
 					   struct mlx5_vport *vport)
 {
 	if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan))
-		mlx5_del_flow_rule(vport->egress.allowed_vlan);
+		mlx5_del_flow_rules(vport->egress.allowed_vlan);
 
 	if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
-		mlx5_del_flow_rule(vport->egress.drop_rule);
+		mlx5_del_flow_rules(vport->egress.drop_rule);
 
 	vport->egress.allowed_vlan = NULL;
 	vport->egress.drop_rule = NULL;
@@ -1173,10 +1173,10 @@ static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
 					    struct mlx5_vport *vport)
 {
 	if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
-		mlx5_del_flow_rule(vport->ingress.drop_rule);
+		mlx5_del_flow_rules(vport->ingress.drop_rule);
 
 	if (!IS_ERR_OR_NULL(vport->ingress.allow_rule))
-		mlx5_del_flow_rule(vport->ingress.allow_rule);
+		mlx5_del_flow_rules(vport->ingress.allow_rule);
 
 	vport->ingress.drop_rule = NULL;
 	vport->ingress.allow_rule = NULL;
@@ -1253,9 +1253,9 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 	vport->ingress.allow_rule =
-		mlx5_add_flow_rule(vport->ingress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
+				    0, NULL, 0);
 	if (IS_ERR(vport->ingress.allow_rule)) {
 		err = PTR_ERR(vport->ingress.allow_rule);
 		esw_warn(esw->dev,
@@ -1267,9 +1267,9 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 
 	memset(spec, 0, sizeof(*spec));
 	vport->ingress.drop_rule =
-		mlx5_add_flow_rule(vport->ingress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_DROP,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_DROP,
+				    0, NULL, 0);
 	if (IS_ERR(vport->ingress.drop_rule)) {
 		err = PTR_ERR(vport->ingress.drop_rule);
 		esw_warn(esw->dev,
@@ -1321,9 +1321,9 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 	vport->egress.allowed_vlan =
-		mlx5_add_flow_rule(vport->egress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
+				    0, NULL, 0);
 	if (IS_ERR(vport->egress.allowed_vlan)) {
 		err = PTR_ERR(vport->egress.allowed_vlan);
 		esw_warn(esw->dev,
@@ -1336,9 +1336,9 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	/* Drop others rule (star rule) */
 	memset(spec, 0, sizeof(*spec));
 	vport->egress.drop_rule =
-		mlx5_add_flow_rule(vport->egress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_DROP,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_DROP,
+				    0, NULL, 0);
 	if (IS_ERR(vport->egress.drop_rule)) {
 		err = PTR_ERR(vport->egress.drop_rule);
 		esw_warn(esw->dev,
@@ -1667,7 +1667,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 		esw_disable_vport(esw, i);
 
 	if (mc_promisc && mc_promisc->uplink_rule)
-		mlx5_del_flow_rule(mc_promisc->uplink_rule);
+		mlx5_del_flow_rules(mc_promisc->uplink_rule);
 
 	esw_destroy_tsar(esw);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ddae90c1f15b..6d414cb1b75f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -97,16 +97,16 @@ struct vport_ingress {
 	struct mlx5_flow_group *allow_spoofchk_only_grp;
 	struct mlx5_flow_group *allow_untagged_only_grp;
 	struct mlx5_flow_group *drop_grp;
-	struct mlx5_flow_rule  *allow_rule;
-	struct mlx5_flow_rule  *drop_rule;
+	struct mlx5_flow_handle  *allow_rule;
+	struct mlx5_flow_handle  *drop_rule;
 };
 
 struct vport_egress {
 	struct mlx5_flow_table *acl;
 	struct mlx5_flow_group *allowed_vlans_grp;
 	struct mlx5_flow_group *drop_grp;
-	struct mlx5_flow_rule  *allowed_vlan;
-	struct mlx5_flow_rule  *drop_rule;
+	struct mlx5_flow_handle  *allowed_vlan;
+	struct mlx5_flow_handle  *drop_rule;
 };
 
 struct mlx5_vport_info {
@@ -125,8 +125,8 @@ struct mlx5_vport {
 	int                     vport;
 	struct hlist_head       uc_list[MLX5_L2_ADDR_HASH_SIZE];
 	struct hlist_head       mc_list[MLX5_L2_ADDR_HASH_SIZE];
-	struct mlx5_flow_rule   *promisc_rule;
-	struct mlx5_flow_rule   *allmulti_rule;
+	struct mlx5_flow_handle *promisc_rule;
+	struct mlx5_flow_handle *allmulti_rule;
 	struct work_struct      vport_change_handler;
 
 	struct vport_ingress    ingress;
@@ -162,7 +162,7 @@ struct mlx5_eswitch_fdb {
 			struct mlx5_flow_table *fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
-			struct mlx5_flow_rule  *miss_rule;
+			struct mlx5_flow_handle *miss_rule;
 			int vlan_push_pop_refcount;
 		} offloads;
 	};
@@ -175,7 +175,7 @@ enum {
 };
 
 struct mlx5_esw_sq {
-	struct mlx5_flow_rule	*send_to_vport_rule;
+	struct mlx5_flow_handle	*send_to_vport_rule;
 	struct list_head	 list;
 };
 
@@ -188,7 +188,7 @@ struct mlx5_eswitch_rep {
 	u8		       hw_id[ETH_ALEN];
 	void		      *priv_data;
 
-	struct mlx5_flow_rule *vport_rx_rule;
+	struct mlx5_flow_handle *vport_rx_rule;
 	struct list_head       vport_sqs_list;
 	u16		       vlan;
 	u32		       vlan_refcount;
@@ -257,11 +257,11 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 struct mlx5_flow_spec;
 struct mlx5_esw_flow_attr;
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr);
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
 
 enum {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c55ad8d00c05..8b2a3832cd0a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -43,14 +43,14 @@ enum {
 	FDB_SLOW_PATH
 };
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest = { 0 };
 	struct mlx5_fc *counter = NULL;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	void *misc;
 	int action;
 
@@ -80,8 +80,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
 				      MLX5_MATCH_MISC_PARAMETERS;
 
-	rule = mlx5_add_flow_rule((struct mlx5_flow_table *)esw->fdb_table.fdb,
-				  spec, action, 0, &dest);
+	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
+				   spec, action, 0, &dest, 1);
 
 	if (IS_ERR(rule))
 		mlx5_fc_destroy(esw->dev, counter);
@@ -269,11 +269,11 @@ out:
 	return err;
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
 	void *misc;
 
@@ -296,9 +296,9 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = vport;
 
-	flow_rule = mlx5_add_flow_rule(esw->fdb_table.offloads.fdb, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule))
 		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
 out:
@@ -315,7 +315,7 @@ void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
 		return;
 
 	list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
-		mlx5_del_flow_rule(esw_sq->send_to_vport_rule);
+		mlx5_del_flow_rules(esw_sq->send_to_vport_rule);
 		list_del(&esw_sq->list);
 		kfree(esw_sq);
 	}
@@ -325,7 +325,7 @@ int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 				 struct mlx5_eswitch_rep *rep,
 				 u16 *sqns_array, int sqns_num)
 {
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_esw_sq *esw_sq;
 	int err;
 	int i;
@@ -362,7 +362,7 @@ out_err:
 static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule = NULL;
+	struct mlx5_flow_handle *flow_rule = NULL;
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
@@ -376,9 +376,9 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = 0;
 
-	flow_rule = mlx5_add_flow_rule(esw->fdb_table.offloads.fdb, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		err = PTR_ERR(flow_rule);
 		esw_warn(esw->dev,  "FDB: Failed to add miss flow rule err %d\n", err);
@@ -501,7 +501,7 @@ static void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
 		return;
 
 	esw_debug(esw->dev, "Destroy offloads FDB Table\n");
-	mlx5_del_flow_rule(esw->fdb_table.offloads.miss_rule);
+	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
 
@@ -585,11 +585,11 @@ static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw)
 	mlx5_destroy_flow_group(esw->offloads.vport_rx_group);
 }
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
 	void *misc;
 
@@ -610,9 +610,9 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = tirn;
 
-	flow_rule = mlx5_add_flow_rule(esw->offloads.ft_offloads, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule));
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 43d7052c76fc..6732287a98c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -155,6 +155,9 @@ static void del_flow_group(struct fs_node *node);
 static void del_fte(struct fs_node *node);
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 				struct mlx5_flow_destination *d2);
+static struct mlx5_flow_rule *
+find_flow_rule(struct fs_fte *fte,
+	       struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
 			   unsigned int refcount,
@@ -640,8 +643,8 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
 	return err;
 }
 
-int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-				 struct mlx5_flow_destination *dest)
+static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+					 struct mlx5_flow_destination *dest)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
@@ -666,6 +669,28 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 	return err;
 }
 
+/* Modify one rule of @handle to use @new_dest: its only rule if @old_dest is
+ * NULL, else the rule whose dest_attr matches @old_dest. -EINVAL if none.
+ */
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handle,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest)
+{
+	int i;
+
+	if (!old_dest) {
+		if (handle->num_rules != 1)
+			return -EINVAL;
+		return _mlx5_modify_rule_destination(handle->rule[0], new_dest);
+	}
+	for (i = 0; i < handle->num_rules; i++) {
+		if (mlx5_flow_dests_cmp(old_dest, &handle->rule[i]->dest_attr))
+			return _mlx5_modify_rule_destination(handle->rule[i],
+							     new_dest);
+	}
+	return -EINVAL;
+}
+
 /* Modify/set FWD rules that point on old_next_ft to point on new_next_ft  */
 static int connect_fwd_rules(struct mlx5_core_dev *dev,
 			     struct mlx5_flow_table *new_next_ft,
@@ -688,7 +713,7 @@ static int connect_fwd_rules(struct mlx5_core_dev *dev,
 	list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules);
 	mutex_unlock(&old_next_ft->lock);
 	list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) {
-		err = mlx5_modify_rule_destination(iter, &dest);
+		err = _mlx5_modify_rule_destination(iter, &dest);
 		if (err)
 			pr_err("mlx5_core: failed to modify rule to point on flow table %d\n",
 			       new_next_ft->id);
@@ -917,41 +942,117 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest)
 	return rule;
 }
 
-/* fte should not be deleted while calling this function */
-static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
-					   struct mlx5_flow_group *fg,
-					   struct mlx5_flow_destination *dest,
-					   bool update_action)
+static struct mlx5_flow_handle *alloc_handle(int num_rules)
 {
+	struct mlx5_flow_handle *handle;
+
+	handle = kzalloc(sizeof(*handle) + sizeof(handle->rule[0]) *
+			  num_rules, GFP_KERNEL); /* struct + rule[] slots */
+	if (!handle)
+		return NULL;
+
+	handle->num_rules = num_rules;
+
+	return handle;
+}
+
+static void destroy_flow_handle(struct fs_fte *fte,
+				struct mlx5_flow_handle *handle,
+				struct mlx5_flow_destination *dest,
+				int i)
+{
+	while (--i >= 0) { /* drop one reference on each of the first i rules */
+		if (!atomic_dec_and_test(&handle->rule[i]->node.refcount))
+			continue;
+		fte->dests_size -= dest ? 1 : 0; /* counted only with a dest */
+		list_del(&handle->rule[i]->node.list);
+		kfree(handle->rule[i]);
+	}
+	kfree(handle);
+}
+
+static struct mlx5_flow_handle *
+create_flow_handle(struct fs_fte *fte,
+		   struct mlx5_flow_destination *dest,
+		   int dest_num,
+		   int *modify_mask,
+		   bool *new_rule)
+{
+	struct mlx5_flow_handle *handle;
+	struct mlx5_flow_rule *rule = NULL;
+	static int count = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+	static int dst = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	int type;
+	int i = 0;
+
+	handle = alloc_handle((dest_num) ? dest_num : 1); /* at least 1 slot */
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	do {
+		if (dest) {
+			rule = find_flow_rule(fte, dest + i);
+			if (rule) { /* reuse: take another reference */
+				atomic_inc(&rule->node.refcount);
+				goto rule_found;
+			}
+		}
+
+		*new_rule = true; /* tells the caller a rule was created */
+		rule = alloc_rule(dest + i);
+		if (!rule)
+			goto free_rules;
+
+		/* New rule: link it under the fte. Flow-table destinations go
+		 * last so forward-to-next-prio rules stay at the list's end.
+		 */
+		tree_init_node(&rule->node, 1, del_rule);
+		if (dest &&
+		    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			list_add(&rule->node.list, &fte->node.children);
+		else
+			list_add_tail(&rule->node.list, &fte->node.children);
+		if (dest) {
+			fte->dests_size++;
+
+			type = dest[i].type ==
+				MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+			*modify_mask |= type ? count : dst;
+		}
+rule_found:
+		handle->rule[i] = rule;
+	} while (++i < dest_num);
+
+	return handle;
+
+free_rules:
+	destroy_flow_handle(fte, handle, dest, i); /* undo rule[0..i-1] */
+	return ERR_PTR(-ENOMEM);
+}
+
+/* fte should not be deleted while calling this function */
+static struct mlx5_flow_handle *
+add_rule_fte(struct fs_fte *fte,
+	     struct mlx5_flow_group *fg,
+	     struct mlx5_flow_destination *dest,
+	     int dest_num,
+	     bool update_action)
+{
+	struct mlx5_flow_handle *handle;
 	struct mlx5_flow_table *ft;
-	struct mlx5_flow_rule *rule;
 	int modify_mask = 0;
 	int err;
+	bool new_rule = false;
 
-	rule = alloc_rule(dest);
-	if (!rule)
-		return ERR_PTR(-ENOMEM);
+	handle = create_flow_handle(fte, dest, dest_num, &modify_mask,
+				    &new_rule);
+	if (IS_ERR(handle) || !new_rule)
+		goto out;
 
 	if (update_action)
 		modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION);
 
 	fs_get_obj(ft, fg->node.parent);
-	/* Add dest to dests list- we need flow tables to be in the
-	 * end of the list for forward to next prio rules.
-	 */
-	tree_init_node(&rule->node, 1, del_rule);
-	if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
-		list_add(&rule->node.list, &fte->node.children);
-	else
-		list_add_tail(&rule->node.list, &fte->node.children);
-	if (dest) {
-		fte->dests_size++;
-
-		modify_mask |= dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER ?
-			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS) :
-			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
-	}
-
 	if (!(fte->status & FS_FTE_STATUS_EXISTING))
 		err = mlx5_cmd_create_fte(get_dev(&ft->node),
 					  ft, fg->id, fte);
@@ -959,17 +1060,15 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 		err = mlx5_cmd_update_fte(get_dev(&ft->node),
 					  ft, fg->id, modify_mask, fte);
 	if (err)
-		goto free_rule;
+		goto free_handle;
 
 	fte->status |= FS_FTE_STATUS_EXISTING;
 
-	return rule;
+out:
+	return handle;
 
-free_rule:
-	list_del(&rule->node.list);
-	kfree(rule);
-	if (dest)
-		fte->dests_size--;
+free_handle:
+	destroy_flow_handle(fte, handle, dest, handle->num_rules);
 	return ERR_PTR(err);
 }
 
@@ -1098,16 +1197,18 @@ static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte,
 	return NULL;
 }
 
-static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
-					  u32 *match_value,
-					  u8 action,
-					  u32 flow_tag,
-					  struct mlx5_flow_destination *dest)
+static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
+					    u32 *match_value,
+					    u8 action,
+					    u32 flow_tag,
+					    struct mlx5_flow_destination *dest,
+					    int dest_num)
 {
-	struct fs_fte *fte;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *handle;
 	struct mlx5_flow_table *ft;
 	struct list_head *prev;
+	struct fs_fte *fte;
+	int i;
 
 	nested_lock_ref_node(&fg->node, FS_MUTEX_PARENT);
 	fs_for_each_fte(fte, fg) {
@@ -1116,40 +1217,33 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 		    (action & fte->action) && flow_tag == fte->flow_tag) {
 			int old_action = fte->action;
 
-			rule = find_flow_rule(fte, dest);
-			if (rule) {
-				atomic_inc(&rule->node.refcount);
-				unlock_ref_node(&fte->node);
-				unlock_ref_node(&fg->node);
-				return rule;
-			}
 			fte->action |= action;
-			rule = add_rule_fte(fte, fg, dest,
-					    old_action != action);
-			if (IS_ERR(rule)) {
+			handle = add_rule_fte(fte, fg, dest, dest_num,
+					      old_action != action);
+			if (IS_ERR(handle)) {
 				fte->action = old_action;
 				goto unlock_fte;
 			} else {
-				goto add_rule;
+				goto add_rules;
 			}
 		}
 		unlock_ref_node(&fte->node);
 	}
 	fs_get_obj(ft, fg->node.parent);
 	if (fg->num_ftes >= fg->max_ftes) {
-		rule = ERR_PTR(-ENOSPC);
+		handle = ERR_PTR(-ENOSPC);
 		goto unlock_fg;
 	}
 
 	fte = create_fte(fg, match_value, action, flow_tag, &prev);
 	if (IS_ERR(fte)) {
-		rule = (void *)fte;
+		handle = (void *)fte;
 		goto unlock_fg;
 	}
 	tree_init_node(&fte->node, 0, del_fte);
 	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-	rule = add_rule_fte(fte, fg, dest, false);
-	if (IS_ERR(rule)) {
+	handle = add_rule_fte(fte, fg, dest, dest_num, false);
+	if (IS_ERR(handle)) {
 		kfree(fte);
 		goto unlock_fg;
 	}
@@ -1158,21 +1252,24 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 
 	tree_add_node(&fte->node, &fg->node);
 	list_add(&fte->node.list, prev);
-add_rule:
-	tree_add_node(&rule->node, &fte->node);
+add_rules:
+	for (i = 0; i < handle->num_rules; i++) {
+		if (atomic_read(&handle->rule[i]->node.refcount) == 1)
+			tree_add_node(&handle->rule[i]->node, &fte->node);
+	}
 unlock_fte:
 	unlock_ref_node(&fte->node);
 unlock_fg:
 	unlock_ref_node(&fg->node);
-	return rule;
+	return handle;
 }
 
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule)
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle)
 {
 	struct mlx5_flow_rule *dst;
 	struct fs_fte *fte;
 
-	fs_get_obj(fte, rule->node.parent);
+	fs_get_obj(fte, handle->rule[0]->node.parent);
 
 	fs_for_each_dst(dst, fte) {
 		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
@@ -1211,18 +1308,22 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 	return true;
 }
 
-static struct mlx5_flow_rule *
-_mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		    u32 action,
-		    u32 flow_tag,
-		    struct mlx5_flow_destination *dest)
+static struct mlx5_flow_handle *
+_mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		     struct mlx5_flow_spec *spec,
+		     u32 action,
+		     u32 flow_tag,
+		     struct mlx5_flow_destination *dest,
+		     int dest_num)
 {
 	struct mlx5_flow_group *g;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
+	int i;
 
-	if (!dest_is_valid(dest, action, ft))
-		return ERR_PTR(-EINVAL);
+	for (i = 0; i < dest_num; i++) {
+		if (!dest_is_valid(&dest[i], action, ft))
+			return ERR_PTR(-EINVAL);
+	}
 
 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
 	fs_for_each_fg(g, ft)
@@ -1231,7 +1332,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 					   g->mask.match_criteria,
 					   spec->match_criteria)) {
 			rule = add_rule_fg(g, spec->match_value,
-					   action, flow_tag, dest);
+					   action, flow_tag, dest, dest_num);
 			if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC)
 				goto unlock;
 		}
@@ -1244,7 +1345,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 	}
 
 	rule = add_rule_fg(g, spec->match_value,
-			   action, flow_tag, dest);
+			   action, flow_tag, dest, dest_num);
 	if (IS_ERR(rule)) {
 		/* Remove assumes refcount > 0 and autogroup creates a group
 		 * with a refcount = 0.
@@ -1265,17 +1366,18 @@ static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
 		(MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs)));
 }
 
-struct mlx5_flow_rule *
-mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		   u32 action,
-		   u32 flow_tag,
-		   struct mlx5_flow_destination *dest)
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    u32 action,
+		    u32 flow_tag,
+		    struct mlx5_flow_destination *dest,
+		    int dest_num)
 {
 	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
 	struct mlx5_flow_destination gen_dest;
 	struct mlx5_flow_table *next_ft = NULL;
-	struct mlx5_flow_rule *rule = NULL;
+	struct mlx5_flow_handle *handle = NULL;
 	u32 sw_action = action;
 	struct fs_prio *prio;
 
@@ -1291,6 +1393,7 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 			gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 			gen_dest.ft = next_ft;
 			dest = &gen_dest;
+			dest_num = 1;
 			action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 		} else {
 			mutex_unlock(&root->chain_lock);
@@ -1298,27 +1401,33 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		}
 	}
 
-	rule = _mlx5_add_flow_rule(ft, spec, action, flow_tag, dest);
+	handle = _mlx5_add_flow_rules(ft, spec, action, flow_tag, dest,
+				      dest_num);
 
 	if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
-		if (!IS_ERR_OR_NULL(rule) &&
-		    (list_empty(&rule->next_ft))) {
+		if (!IS_ERR_OR_NULL(handle) &&
+		    (list_empty(&handle->rule[0]->next_ft))) {
 			mutex_lock(&next_ft->lock);
-			list_add(&rule->next_ft, &next_ft->fwd_rules);
+			list_add(&handle->rule[0]->next_ft,
+				 &next_ft->fwd_rules);
 			mutex_unlock(&next_ft->lock);
-			rule->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+			handle->rule[0]->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 		}
 		mutex_unlock(&root->chain_lock);
 	}
-	return rule;
+	return handle;
 }
-EXPORT_SYMBOL(mlx5_add_flow_rule);
+EXPORT_SYMBOL(mlx5_add_flow_rules);
 
-void mlx5_del_flow_rule(struct mlx5_flow_rule *rule)
+void mlx5_del_flow_rules(struct mlx5_flow_handle *handle)
 {
-	tree_remove_node(&rule->node);
+	int i;
+
+	for (i = handle->num_rules - 1; i >= 0; i--)
+		tree_remove_node(&handle->rule[i]->node);
+	kfree(handle);
 }
-EXPORT_SYMBOL(mlx5_del_flow_rule);
+EXPORT_SYMBOL(mlx5_del_flow_rules);
 
 /* Assuming prio->node.children(flow tables) is sorted by level */
 static struct mlx5_flow_table *find_next_ft(struct mlx5_flow_table *ft)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 71ff03bceabb..d5150888645c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -94,6 +94,11 @@ struct mlx5_flow_rule {
 	u32					sw_action;
 };
 
+struct mlx5_flow_handle {
+	int num_rules;
+	struct mlx5_flow_rule *rule[];
+};
+
 /* Type of children is mlx5_flow_group */
 struct mlx5_flow_table {
 	struct fs_node			node;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 93ebc5e21334..0dcd287f4bd0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -69,8 +69,8 @@ enum mlx5_flow_namespace_type {
 
 struct mlx5_flow_table;
 struct mlx5_flow_group;
-struct mlx5_flow_rule;
 struct mlx5_flow_namespace;
+struct mlx5_flow_handle;
 
 struct mlx5_flow_spec {
 	u8   match_criteria_enable;
@@ -127,18 +127,20 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
  */
-struct mlx5_flow_rule *
-mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		   u32 action,
-		   u32 flow_tag,
-		   struct mlx5_flow_destination *dest);
-void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
-
-int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-				 struct mlx5_flow_destination *dest);
-
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    u32 action,
+		    u32 flow_tag,
+		    struct mlx5_flow_destination *dest,
+		    int dest_num);
+void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
+
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest);
+
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler);
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
cgit v1.2.3


From c62cce2caee558e18aa05c01c2fd3b40f07174f2 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Mon, 24 Oct 2016 18:29:13 -0700
Subject: net: add an ioctl to get a socket network namespace

Each socket operates in a network namespace where it has been created,
so if we want to dump and restore a socket, we have to know its network
namespace.

We have socket_diag to get information about sockets, but it doesn't
report sockets which are not bound or connected.

This patch introduces a new socket ioctl, which is called SIOCGSKNS
and used to get a file descriptor for a socket network namespace.

A task must have CAP_NET_ADMIN in a target network namespace to
use this ioctl.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/nsfs.c                    |  2 +-
 include/linux/proc_fs.h      |  4 ++++
 include/uapi/linux/sockios.h |  1 +
 net/socket.c                 | 13 +++++++++++++
 4 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8718af895eab..8c9fb29c6673 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -118,7 +118,7 @@ again:
 	return ret;
 }
 
-static int open_related_ns(struct ns_common *ns,
+int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
 	struct path path = {};
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index b97bf2ef996e..368c7ad06ae5 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -82,4 +82,8 @@ static inline struct proc_dir_entry *proc_net_mkdir(
 	return proc_mkdir_data(name, 0, parent, net);
 }
 
+struct ns_common;
+int open_related_ns(struct ns_common *ns,
+		   struct ns_common *(*get_ns)(struct ns_common *ns));
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
index 8e7890b26d9a..83cc54ce6081 100644
--- a/include/uapi/linux/sockios.h
+++ b/include/uapi/linux/sockios.h
@@ -84,6 +84,7 @@
 #define SIOCWANDEV	0x894A		/* get/set netdev parameters	*/
 
 #define SIOCOUTQNSD	0x894B		/* output queue size (not sent only) */
+#define SIOCGSKNS	0x894C		/* get socket network namespace */
 
 /* ARP cache control calls. */
 		    /*  0x8950 - 0x8952  * obsolete calls, don't re-use */
diff --git a/net/socket.c b/net/socket.c
index 5a9bf5ee2464..970a7ea3fc4a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -877,6 +877,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
  *	what to do with it - that's up to the protocol still.
  */
 
+static struct ns_common *get_net_ns(struct ns_common *ns)
+{
+	return &get_net(container_of(ns, struct net, ns))->ns;
+}
+
 static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	struct socket *sock;
@@ -945,6 +950,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 				err = dlci_ioctl_hook(cmd, argp);
 			mutex_unlock(&dlci_ioctl_mutex);
 			break;
+		case SIOCGSKNS:
+			err = -EPERM;
+			if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+				break;
+
+			err = open_related_ns(&net->ns, get_net_ns);
+			break;
 		default:
 			err = sock_do_ioctl(net, sock, cmd, arg);
 			break;
@@ -3093,6 +3105,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
 	case SIOCSIFVLAN:
 	case SIOCADDDLCI:
 	case SIOCDELDLCI:
+	case SIOCGSKNS:
 		return sock_ioctl(file, cmd, arg);
 
 	case SIOCGIFFLAGS:
-- 
cgit v1.2.3


From 9cf1f6a8c4cbb7836b838b51b3b02ddf32c6c6a0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:43:20 -0400
Subject: net: Move functions for configuring traffic classes out of inline
 headers

The functions for configuring the traffic class to queue mappings have
other effects that need to be addressed.  Instead of trying to export a
bunch of new functions just relocate the functions so that we can
instrument them directly with the functionality they will need.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 31 +++----------------------------
 net/core/dev.c            | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 20ce8df115ac..e05ab3bd48d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,34 +1920,9 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
 	return 0;
 }
 
-static inline
-void netdev_reset_tc(struct net_device *dev)
-{
-	dev->num_tc = 0;
-	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
-	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
-}
-
-static inline
-int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
-{
-	if (tc >= dev->num_tc)
-		return -EINVAL;
-
-	dev->tc_to_txq[tc].count = count;
-	dev->tc_to_txq[tc].offset = offset;
-	return 0;
-}
-
-static inline
-int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
-{
-	if (num_tc > TC_MAX_QUEUE)
-		return -EINVAL;
-
-	dev->num_tc = num_tc;
-	return 0;
-}
+void netdev_reset_tc(struct net_device *dev);
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc);
 
 static inline
 int netdev_get_num_tc(struct net_device *dev)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8341dadf5e94..2d54be912136 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2173,6 +2173,35 @@ error:
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+void netdev_reset_tc(struct net_device *dev)
+{
+	dev->num_tc = 0;
+	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+EXPORT_SYMBOL(netdev_reset_tc);
+
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->tc_to_txq[tc].count = count;
+	dev->tc_to_txq[tc].offset = offset;
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_tc_queue);
+
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+	if (num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	dev->num_tc = num_tc;
+	return 0;
+}
+EXPORT_SYMBOL(netdev_set_num_tc);
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
-- 
cgit v1.2.3


From 8d059b0f6f5b1d3acf829454e1087818ad660058 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:43:49 -0400
Subject: net: Add sysfs value to determine queue traffic class

Add a sysfs attribute for a Tx queue that allows us to determine the
traffic class for a given queue.  This will allow us to more easily
determine this in the future.  It is needed as XPS will take the traffic
class for a group of queues into account in order to avoid pulling traffic
from one traffic class into another.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 17 +++++++++++++++++
 net/core/net-sysfs.c      | 20 +++++++++++++++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e05ab3bd48d2..d91a41860614 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,6 +1920,7 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
 	return 0;
 }
 
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
 void netdev_reset_tc(struct net_device *dev);
 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
 int netdev_set_num_tc(struct net_device *dev, u8 num_tc);
diff --git a/net/core/dev.c b/net/core/dev.c
index 2d54be912136..db0fdbbcd9b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1948,6 +1948,23 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
 	}
 }
 
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
+{
+	if (dev->num_tc) {
+		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+		int i;
+
+		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
+			if ((txq - tc->offset) < tc->count)
+				return i;
+		}
+
+		return -1;
+	}
+
+	return 0;
+}
+
 #ifdef CONFIG_XPS
 static DEFINE_MUTEX(xps_map_mutex);
 #define xmap_dereference(P)		\
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d4fe28606ff5..38bd9b933195 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1024,7 +1024,6 @@ static ssize_t show_trans_timeout(struct netdev_queue *queue,
 	return sprintf(buf, "%lu", trans_timeout);
 }
 
-#ifdef CONFIG_XPS
 static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 {
 	struct net_device *dev = queue->dev;
@@ -1036,6 +1035,21 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
 	return i;
 }
 
+static ssize_t show_traffic_class(struct netdev_queue *queue,
+				  struct netdev_queue_attribute *attribute,
+				  char *buf)
+{
+	struct net_device *dev = queue->dev;
+	int index = get_netdev_queue_index(queue);
+	int tc = netdev_txq_to_tc(dev, index);
+
+	if (tc < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%u\n", tc);
+}
+
+#ifdef CONFIG_XPS
 static ssize_t show_tx_maxrate(struct netdev_queue *queue,
 			       struct netdev_queue_attribute *attribute,
 			       char *buf)
@@ -1078,6 +1092,9 @@ static struct netdev_queue_attribute queue_tx_maxrate =
 static struct netdev_queue_attribute queue_trans_timeout =
 	__ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
 
+static struct netdev_queue_attribute queue_traffic_class =
+	__ATTR(traffic_class, S_IRUGO, show_traffic_class, NULL);
+
 #ifdef CONFIG_BQL
 /*
  * Byte queue limits sysfs structures and functions.
@@ -1263,6 +1280,7 @@ static struct netdev_queue_attribute xps_cpus_attribute =
 
 static struct attribute *netdev_queue_default_attrs[] = {
 	&queue_trans_timeout.attr,
+	&queue_traffic_class.attr,
 #ifdef CONFIG_XPS
 	&xps_cpus_attribute.attr,
 	&queue_tx_maxrate.attr,
-- 
cgit v1.2.3


From 184c449f91fef521042970cca46bd5cdfc0e3a37 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:50:13 -0400
Subject: net: Add support for XPS with QoS via traffic classes

This patch adds support for setting and using XPS when QoS via traffic
classes is enabled.  With this change we will factor in the priority and
traffic class mapping of the packet and use that information to correctly
select the queue.

This allows us to define a set of queues for a given traffic class via
mqprio and then configure the XPS mapping for those queues so that the
traffic flows can avoid head-of-line blocking between the individual CPUs
if so desired.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   4 +-
 net/core/dev.c            | 117 ++++++++++++++++++++++++++++++++--------------
 net/core/net-sysfs.c      |  31 +++++++-----
 3 files changed, 105 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d91a41860614..66fd61c681d9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -732,8 +732,8 @@ struct xps_dev_maps {
 	struct rcu_head rcu;
 	struct xps_map __rcu *cpu_map[0];
 };
-#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) +		\
-    (nr_cpu_ids * sizeof(struct xps_map *)))
+#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +		\
+	(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE	16
diff --git a/net/core/dev.c b/net/core/dev.c
index 108a6adce185..f23e28668f32 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2002,14 +2002,22 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
 				 struct xps_dev_maps *dev_maps,
 				 int cpu, u16 offset, u16 count)
 {
-	int i, j;
+	int num_tc = dev->num_tc ? : 1;
+	bool active = false;
+	int tci;
 
-	for (i = count, j = offset; i--; j++) {
-		if (!remove_xps_queue(dev_maps, cpu, j))
-			break;
+	for (tci = cpu * num_tc; num_tc--; tci++) {
+		int i, j;
+
+		for (i = count, j = offset; i--; j++) {
+			if (!remove_xps_queue(dev_maps, cpu, j))
+				break;
+		}
+
+		active |= i < 0;
 	}
 
-	return i < 0;
+	return active;
 }
 
 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
@@ -2086,20 +2094,28 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index)
 {
 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+	int i, cpu, tci, numa_node_id = -2;
+	int maps_sz, num_tc = 1, tc = 0;
 	struct xps_map *map, *new_map;
-	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
-	int cpu, numa_node_id = -2;
 	bool active = false;
 
+	if (dev->num_tc) {
+		num_tc = dev->num_tc;
+		tc = netdev_txq_to_tc(dev, index);
+		if (tc < 0)
+			return -EINVAL;
+	}
+
+	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
+	if (maps_sz < L1_CACHE_BYTES)
+		maps_sz = L1_CACHE_BYTES;
+
 	mutex_lock(&xps_map_mutex);
 
 	dev_maps = xmap_dereference(dev->xps_maps);
 
 	/* allocate memory for queue storage */
-	for_each_online_cpu(cpu) {
-		if (!cpumask_test_cpu(cpu, mask))
-			continue;
-
+	for_each_cpu_and(cpu, cpu_online_mask, mask) {
 		if (!new_dev_maps)
 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 		if (!new_dev_maps) {
@@ -2107,25 +2123,38 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			return -ENOMEM;
 		}
 
-		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+		tci = cpu * num_tc + tc;
+		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
 				 NULL;
 
 		map = expand_xps_map(map, cpu, index);
 		if (!map)
 			goto error;
 
-		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
 	}
 
 	if (!new_dev_maps)
 		goto out_no_new_maps;
 
 	for_each_possible_cpu(cpu) {
+		/* copy maps belonging to foreign traffic classes */
+		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
+			/* fill in the new device map from the old device map */
+			map = xmap_dereference(dev_maps->cpu_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+		}
+
+		/* We need to explicitly update tci as previous loop
+		 * could break out early if dev_maps is NULL.
+		 */
+		tci = cpu * num_tc + tc;
+
 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
 			/* add queue to CPU maps */
 			int pos = 0;
 
-			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
 			while ((pos < map->len) && (map->queues[pos] != index))
 				pos++;
 
@@ -2139,26 +2168,36 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 #endif
 		} else if (dev_maps) {
 			/* fill in the new device map from the old device map */
-			map = xmap_dereference(dev_maps->cpu_map[cpu]);
-			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+			map = xmap_dereference(dev_maps->cpu_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
 		}
 
+		/* copy maps belonging to foreign traffic classes */
+		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
+			/* fill in the new device map from the old device map */
+			map = xmap_dereference(dev_maps->cpu_map[tci]);
+			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
+		}
 	}
 
 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
 
 	/* Cleanup old maps */
-	if (dev_maps) {
-		for_each_possible_cpu(cpu) {
-			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
-			map = xmap_dereference(dev_maps->cpu_map[cpu]);
+	if (!dev_maps)
+		goto out_no_old_maps;
+
+	for_each_possible_cpu(cpu) {
+		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+			map = xmap_dereference(dev_maps->cpu_map[tci]);
 			if (map && map != new_map)
 				kfree_rcu(map, rcu);
 		}
-
-		kfree_rcu(dev_maps, rcu);
 	}
 
+	kfree_rcu(dev_maps, rcu);
+
+out_no_old_maps:
 	dev_maps = new_dev_maps;
 	active = true;
 
@@ -2173,11 +2212,12 @@ out_no_new_maps:
 
 	/* removes queue from unused CPUs */
 	for_each_possible_cpu(cpu) {
-		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
-			continue;
-
-		if (remove_xps_queue(dev_maps, cpu, index))
-			active = true;
+		for (i = tc, tci = cpu * num_tc; i--; tci++)
+			active |= remove_xps_queue(dev_maps, tci, index);
+		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
+			active |= remove_xps_queue(dev_maps, tci, index);
+		for (i = num_tc - tc, tci++; --i; tci++)
+			active |= remove_xps_queue(dev_maps, tci, index);
 	}
 
 	/* free map if not active */
@@ -2193,11 +2233,14 @@ out_no_maps:
 error:
 	/* remove any maps that we added */
 	for_each_possible_cpu(cpu) {
-		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
-		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
-				 NULL;
-		if (new_map && new_map != map)
-			kfree(new_map);
+		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
+			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
+			map = dev_maps ?
+			      xmap_dereference(dev_maps->cpu_map[tci]) :
+			      NULL;
+			if (new_map && new_map != map)
+				kfree(new_map);
+		}
 	}
 
 	mutex_unlock(&xps_map_mutex);
@@ -3158,8 +3201,14 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 	rcu_read_lock();
 	dev_maps = rcu_dereference(dev->xps_maps);
 	if (dev_maps) {
-		map = rcu_dereference(
-		    dev_maps->cpu_map[skb->sender_cpu - 1]);
+		unsigned int tci = skb->sender_cpu - 1;
+
+		if (dev->num_tc) {
+			tci *= dev->num_tc;
+			tci += netdev_get_prio_tc_map(dev, skb->priority);
+		}
+
+		map = rcu_dereference(dev_maps->cpu_map[tci]);
 		if (map) {
 			if (map->len == 1)
 				queue_index = map->queues[0];
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 38bd9b933195..b0c04cf4851d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1210,29 +1210,38 @@ static ssize_t show_xps_map(struct netdev_queue *queue,
 			    struct netdev_queue_attribute *attribute, char *buf)
 {
 	struct net_device *dev = queue->dev;
+	int cpu, len, num_tc = 1, tc = 0;
 	struct xps_dev_maps *dev_maps;
 	cpumask_var_t mask;
 	unsigned long index;
-	int i, len;
 
 	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
 		return -ENOMEM;
 
 	index = get_netdev_queue_index(queue);
 
+	if (dev->num_tc) {
+		num_tc = dev->num_tc;
+		tc = netdev_txq_to_tc(dev, index);
+		if (tc < 0)
+			return -EINVAL;
+	}
+
 	rcu_read_lock();
 	dev_maps = rcu_dereference(dev->xps_maps);
 	if (dev_maps) {
-		for_each_possible_cpu(i) {
-			struct xps_map *map =
-			    rcu_dereference(dev_maps->cpu_map[i]);
-			if (map) {
-				int j;
-				for (j = 0; j < map->len; j++) {
-					if (map->queues[j] == index) {
-						cpumask_set_cpu(i, mask);
-						break;
-					}
+		for_each_possible_cpu(cpu) {
+			int i, tci = cpu * num_tc + tc;
+			struct xps_map *map;
+
+			map = rcu_dereference(dev_maps->cpu_map[tci]);
+			if (!map)
+				continue;
+
+			for (i = map->len; i--;) {
+				if (map->queues[i] == index) {
+					cpumask_set_cpu(cpu, mask);
+					break;
 				}
 			}
 		}
-- 
cgit v1.2.3


From 0fefbfbaad298162737d5418eb85065879f99b3e Mon Sep 17 00:00:00 2001
From: Sudarsana Kalluru <Sudarsana.Kalluru@cavium.com>
Date: Mon, 31 Oct 2016 07:14:21 +0200
Subject: qed*: Management firmware - notifications and defaults

Management firmware is interested in various tidbits about
the driver - including the driver state & several configuration
related fields [MTU, primary MAC, etc.].
This adds the necessary logic to update MFW with such configurations,
some of which are passed directly via qed while for others APIs
are provided so that qede would be able to later configure if needed.

This also introduces a new default configuration for MTU which would
replace the default inherited by being an ethernet device.

Signed-off-by: Sudarsana Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h           |   1 +
 drivers/net/ethernet/qlogic/qed/qed_dev.c       |  52 +++++++-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h       |  59 ++++++++-
 drivers/net/ethernet/qlogic/qed/qed_main.c      |  75 +++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c       | 163 ++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h       | 102 +++++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |   2 +
 drivers/net/ethernet/qlogic/qede/qede_main.c    |   8 ++
 include/linux/qed/qed_if.h                      |  28 ++++
 9 files changed, 487 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 653bb5735f0c..f20243c1fb0b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -226,6 +226,7 @@ struct qed_hw_info {
 	u32				port_mode;
 	u32				hw_mode;
 	unsigned long		device_capabilities;
+	u16				mtu;
 };
 
 struct qed_hw_cid_data {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index edae5fc5fccd..33fd69e24bae 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1057,8 +1057,10 @@ int qed_hw_init(struct qed_dev *cdev,
 		bool allow_npar_tx_switch,
 		const u8 *bin_fw_data)
 {
-	u32 load_code, param;
-	int rc, mfw_rc, i;
+	u32 load_code, param, drv_mb_param;
+	bool b_default_mtu = true;
+	struct qed_hwfn *p_hwfn;
+	int rc = 0, mfw_rc, i;
 
 	if ((int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
 		DP_NOTICE(cdev, "MSI mode is not supported for CMT devices\n");
@@ -1074,6 +1076,12 @@ int qed_hw_init(struct qed_dev *cdev,
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
+		/* If management didn't provide a default, set one of our own */
+		if (!p_hwfn->hw_info.mtu) {
+			p_hwfn->hw_info.mtu = 1500;
+			b_default_mtu = false;
+		}
+
 		if (IS_VF(cdev)) {
 			p_hwfn->b_int_enabled = 1;
 			continue;
@@ -1157,6 +1165,38 @@ int qed_hw_init(struct qed_dev *cdev,
 		p_hwfn->hw_init_done = true;
 	}
 
+	if (IS_PF(cdev)) {
+		p_hwfn = QED_LEADING_HWFN(cdev);
+		drv_mb_param = (FW_MAJOR_VERSION << 24) |
+			       (FW_MINOR_VERSION << 16) |
+			       (FW_REVISION_VERSION << 8) |
+			       (FW_ENGINEERING_VERSION);
+		rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
+				 DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER,
+				 drv_mb_param, &load_code, &param);
+		if (rc)
+			DP_INFO(p_hwfn, "Failed to update firmware version\n");
+
+		if (!b_default_mtu) {
+			rc = qed_mcp_ov_update_mtu(p_hwfn, p_hwfn->p_main_ptt,
+						   p_hwfn->hw_info.mtu);
+			if (rc)
+				DP_INFO(p_hwfn,
+					"Failed to update default mtu\n");
+		}
+
+		rc = qed_mcp_ov_update_driver_state(p_hwfn,
+						    p_hwfn->p_main_ptt,
+						  QED_OV_DRIVER_STATE_DISABLED);
+		if (rc)
+			DP_INFO(p_hwfn, "Failed to update driver state\n");
+
+		rc = qed_mcp_ov_update_eswitch(p_hwfn, p_hwfn->p_main_ptt,
+					       QED_OV_ESWITCH_VEB);
+		if (rc)
+			DP_INFO(p_hwfn, "Failed to update eswitch mode\n");
+	}
+
 	return 0;
 }
 
@@ -1801,6 +1841,9 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn,
 
 	qed_get_num_funcs(p_hwfn, p_ptt);
 
+	if (qed_mcp_is_init(p_hwfn))
+		p_hwfn->hw_info.mtu = p_hwfn->mcp_info->func_info.mtu;
+
 	return qed_hw_get_resc(p_hwfn);
 }
 
@@ -1975,8 +2018,13 @@ int qed_hw_prepare(struct qed_dev *cdev,
 
 void qed_hw_remove(struct qed_dev *cdev)
 {
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
 	int i;
 
+	if (IS_PF(cdev))
+		qed_mcp_ov_update_driver_state(p_hwfn, p_hwfn->p_main_ptt,
+					       QED_OV_DRIVER_STATE_NOT_LOADED);
+
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 72eee29c677f..36de87a1befa 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -8564,6 +8564,15 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_INIT_PHY			0x22000000
 #define DRV_MSG_CODE_LINK_RESET			0x23000000
 #define DRV_MSG_CODE_SET_DCBX			0x25000000
+#define DRV_MSG_CODE_OV_UPDATE_CURR_CFG         0x26000000
+#define DRV_MSG_CODE_OV_UPDATE_BUS_NUM          0x27000000
+#define DRV_MSG_CODE_OV_UPDATE_BOOT_PROGRESS    0x28000000
+#define DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER     0x29000000
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE     0x31000000
+#define DRV_MSG_CODE_BW_UPDATE_ACK              0x32000000
+#define DRV_MSG_CODE_OV_UPDATE_MTU              0x33000000
+#define DRV_MSG_CODE_OV_UPDATE_WOL              0x38000000
+#define DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE     0x39000000
 
 #define DRV_MSG_CODE_BW_UPDATE_ACK		0x32000000
 #define DRV_MSG_CODE_NIG_DRAIN			0x30000000
@@ -8574,6 +8583,13 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_MCP_RESET			0x00090000
 #define DRV_MSG_CODE_SET_VERSION		0x000f0000
 #define DRV_MSG_CODE_MCP_HALT                   0x00100000
+#define DRV_MSG_CODE_SET_VMAC                   0x00110000
+#define DRV_MSG_CODE_GET_VMAC                   0x00120000
+#define DRV_MSG_CODE_VMAC_TYPE_SHIFT            4
+#define DRV_MSG_CODE_VMAC_TYPE_MASK             0x30
+#define DRV_MSG_CODE_VMAC_TYPE_MAC              1
+#define DRV_MSG_CODE_VMAC_TYPE_WWNN             2
+#define DRV_MSG_CODE_VMAC_TYPE_WWPN             3
 
 #define DRV_MSG_CODE_GET_STATS                  0x00130000
 #define DRV_MSG_CODE_STATS_TYPE_LAN             1
@@ -8589,7 +8605,10 @@ struct public_drv_mb {
 #define DRV_MSG_SEQ_NUMBER_MASK			0x0000ffff
 
 	u32 drv_mb_param;
-#define DRV_MB_PARAM_UNLOAD_WOL_MCP		0x00000001
+#define DRV_MB_PARAM_UNLOAD_WOL_UNKNOWN         0x00000000
+#define DRV_MB_PARAM_UNLOAD_WOL_MCP             0x00000001
+#define DRV_MB_PARAM_UNLOAD_WOL_DISABLED        0x00000002
+#define DRV_MB_PARAM_UNLOAD_WOL_ENABLED         0x00000003
 #define DRV_MB_PARAM_DCBX_NOTIFY_MASK		0x000000FF
 #define DRV_MB_PARAM_DCBX_NOTIFY_SHIFT		3
 
@@ -8602,6 +8621,44 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_LLDP_SEND_MASK		0x00000001
 #define DRV_MB_PARAM_LLDP_SEND_SHIFT		0
 
+#define DRV_MB_PARAM_OV_CURR_CFG_SHIFT		0
+#define DRV_MB_PARAM_OV_CURR_CFG_MASK		0x0000000F
+#define DRV_MB_PARAM_OV_CURR_CFG_NONE		0
+#define DRV_MB_PARAM_OV_CURR_CFG_OS		1
+#define DRV_MB_PARAM_OV_CURR_CFG_VENDOR_SPEC	2
+#define DRV_MB_PARAM_OV_CURR_CFG_OTHER		3
+
+#define DRV_MB_PARAM_OV_STORM_FW_VER_SHIFT	0
+#define DRV_MB_PARAM_OV_STORM_FW_VER_MASK	0xFFFFFFFF
+#define DRV_MB_PARAM_OV_STORM_FW_VER_MAJOR_MASK	0xFF000000
+#define DRV_MB_PARAM_OV_STORM_FW_VER_MINOR_MASK	0x00FF0000
+#define DRV_MB_PARAM_OV_STORM_FW_VER_BUILD_MASK	0x0000FF00
+#define DRV_MB_PARAM_OV_STORM_FW_VER_DROP_MASK	0x000000FF
+
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_SHIFT	0
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_MASK	0xF
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_UNKNOWN	0x1
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_NOT_LOADED	0x2
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_LOADING	0x3
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_DISABLED	0x4
+#define DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_ACTIVE	0x5
+
+#define DRV_MB_PARAM_OV_MTU_SIZE_SHIFT	0
+#define DRV_MB_PARAM_OV_MTU_SIZE_MASK	0xFFFFFFFF
+
+#define DRV_MB_PARAM_WOL_MASK	(DRV_MB_PARAM_WOL_DEFAULT | \
+				 DRV_MB_PARAM_WOL_DISABLED | \
+				 DRV_MB_PARAM_WOL_ENABLED)
+#define DRV_MB_PARAM_WOL_DEFAULT	DRV_MB_PARAM_UNLOAD_WOL_MCP
+#define DRV_MB_PARAM_WOL_DISABLED	DRV_MB_PARAM_UNLOAD_WOL_DISABLED
+#define DRV_MB_PARAM_WOL_ENABLED	DRV_MB_PARAM_UNLOAD_WOL_ENABLED
+
+#define DRV_MB_PARAM_ESWITCH_MODE_MASK	(DRV_MB_PARAM_ESWITCH_MODE_NONE | \
+					 DRV_MB_PARAM_ESWITCH_MODE_VEB | \
+					 DRV_MB_PARAM_ESWITCH_MODE_VEPA)
+#define DRV_MB_PARAM_ESWITCH_MODE_NONE	0x0
+#define DRV_MB_PARAM_ESWITCH_MODE_VEB	0x1
+#define DRV_MB_PARAM_ESWITCH_MODE_VEPA	0x2
 
 #define DRV_MB_PARAM_SET_LED_MODE_OPER		0x0
 #define DRV_MB_PARAM_SET_LED_MODE_ON		0x1
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index c418360ba02a..d9fa52a22667 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -243,6 +243,8 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 				    &dev_info->mfw_rev, NULL);
 	}
 
+	dev_info->mtu = QED_LEADING_HWFN(cdev)->hw_info.mtu;
+
 	return 0;
 }
 
@@ -1431,6 +1433,76 @@ static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode)
 	return status;
 }
 
+/* Report the driver as active or disabled to the MFW; does nothing on a VF */
+static int qed_update_drv_state(struct qed_dev *cdev, bool active)
+{
+	struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev);
+	enum qed_ov_driver_state drv_state;
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	p_ptt = qed_ptt_acquire(p_lead);
+	if (!p_ptt)
+		return -EAGAIN;
+
+	drv_state = active ? QED_OV_DRIVER_STATE_ACTIVE :
+			     QED_OV_DRIVER_STATE_DISABLED;
+	rc = qed_mcp_ov_update_driver_state(p_lead, p_ptt, drv_state);
+	qed_ptt_release(p_lead, p_ptt);
+	return rc;
+}
+
+/* Hand a new MAC address to the MFW and, if that worked, follow up with a
+ * current-config update naming the driver as client. Does nothing on a VF.
+ */
+static int qed_update_mac(struct qed_dev *cdev, u8 *mac)
+{
+	struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	p_ptt = qed_ptt_acquire(p_lead);
+	if (!p_ptt)
+		return -EAGAIN;
+
+	rc = qed_mcp_ov_update_mac(p_lead, p_ptt, mac);
+	if (!rc)
+		rc = qed_mcp_ov_update_current_config(p_lead, p_ptt,
+						      QED_OV_CLIENT_DRV);
+	qed_ptt_release(p_lead, p_ptt);
+	return rc;
+}
+
+/* Hand a new MTU to the MFW and, if that worked, follow up with a
+ * current-config update naming the driver as client. Does nothing on a VF.
+ */
+static int qed_update_mtu(struct qed_dev *cdev, u16 mtu)
+{
+	struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt;
+	int rc;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	p_ptt = qed_ptt_acquire(p_lead);
+	if (!p_ptt)
+		return -EAGAIN;
+
+	rc = qed_mcp_ov_update_mtu(p_lead, p_ptt, mtu);
+	if (!rc)
+		rc = qed_mcp_ov_update_current_config(p_lead, p_ptt,
+						      QED_OV_CLIENT_DRV);
+	qed_ptt_release(p_lead, p_ptt);
+	return rc;
+}
+
 static struct qed_selftest_ops qed_selftest_ops_pass = {
 	.selftest_memory = &qed_selftest_memory,
 	.selftest_interrupt = &qed_selftest_interrupt,
@@ -1465,6 +1537,9 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.get_coalesce = &qed_get_coalesce,
 	.set_coalesce = &qed_set_coalesce,
 	.set_led = &qed_set_led,
+	.update_drv_state = &qed_update_drv_state,
+	.update_mac = &qed_update_mac,
+	.update_mtu = &qed_update_mtu,
 };
 
 void qed_get_protocol_stats(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index bdc9ba92f6d4..98dc913fd76d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/etherdevice.h>
 #include "qed.h"
 #include "qed_dcbx.h"
 #include "qed_hsi.h"
@@ -1068,6 +1069,8 @@ int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn,
 
 	info->ovlan = (u16)(shmem_info.ovlan_stag & FUNC_MF_CFG_OV_STAG_MASK);
 
+	info->mtu = (u16)shmem_info.mtu_size;
+
 	DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_IFUP),
 		   "Read configuration from shmem: pause_on_host %02x protocol %02x BW [%02x - %02x] MAC %02x:%02x:%02x:%02x:%02x:%02x wwn port %llx node %llx ovlan %04x\n",
 		info->pause_on_host, info->protocol,
@@ -1223,6 +1226,166 @@ int qed_mcp_resume(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 	return (cpu_mode & MCP_REG_CPU_MODE_SOFT_HALT) ? -EAGAIN : 0;
 }
 
+/* Send OV_UPDATE_CURR_CFG to the MFW with the value matching @client */
+int qed_mcp_ov_update_current_config(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt,
+				     enum qed_ov_client client)
+{
+	u32 cfg_src, mcp_resp = 0, mcp_param = 0;
+	int rc;
+
+	/* Pick the mailbox encoding of the qed client type */
+	switch (client) {
+	case QED_OV_CLIENT_DRV:
+		cfg_src = DRV_MB_PARAM_OV_CURR_CFG_OS;
+		break;
+	case QED_OV_CLIENT_USER:
+		cfg_src = DRV_MB_PARAM_OV_CURR_CFG_OTHER;
+		break;
+	case QED_OV_CLIENT_VENDOR_SPEC:
+		cfg_src = DRV_MB_PARAM_OV_CURR_CFG_VENDOR_SPEC;
+		break;
+	default:
+		DP_NOTICE(p_hwfn, "Invalid client type %d\n", client);
+		return -EINVAL;
+	}
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_CURR_CFG,
+			 cfg_src, &mcp_resp, &mcp_param);
+	if (rc)
+		DP_ERR(p_hwfn, "MCP response failure, aborting\n");
+	return rc;
+}
+
+/* Send OV_UPDATE_DRIVER_STATE to the MFW with the value for @drv_state */
+int qed_mcp_ov_update_driver_state(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   enum qed_ov_driver_state drv_state)
+{
+	u32 state_val, mcp_resp = 0, mcp_param = 0;
+	int rc;
+
+	/* Pick the mailbox encoding of the qed driver state */
+	switch (drv_state) {
+	case QED_OV_DRIVER_STATE_NOT_LOADED:
+		state_val = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_NOT_LOADED;
+		break;
+	case QED_OV_DRIVER_STATE_DISABLED:
+		state_val = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_DISABLED;
+		break;
+	case QED_OV_DRIVER_STATE_ACTIVE:
+		state_val = DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE_ACTIVE;
+		break;
+	default:
+		DP_NOTICE(p_hwfn, "Invalid driver state %d\n", drv_state);
+		return -EINVAL;
+	}
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_DRIVER_STATE,
+			 state_val, &mcp_resp, &mcp_param);
+	if (rc)
+		DP_ERR(p_hwfn, "Failed to send driver state\n");
+	return rc;
+}
+
+/* Send the MTU to the MFW through the OV_UPDATE_MTU mailbox command */
+int qed_mcp_ov_update_mtu(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u16 mtu)
+{
+	u32 mtu_param = (u32)mtu << DRV_MB_PARAM_OV_MTU_SIZE_SHIFT;
+	u32 mcp_resp = 0, mcp_param = 0;
+	int rc;
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_MTU,
+			 mtu_param, &mcp_resp, &mcp_param);
+	if (rc)
+		DP_ERR(p_hwfn, "Failed to send mtu value, rc = %d\n", rc);
+
+	return rc;
+}
+
+int qed_mcp_ov_update_mac(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u8 *mac)
+{
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data union_data;	/* passed whole, so zeroed */
+	int rc;
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	memset(&union_data, 0, sizeof(union_data));
+	mb_params.cmd = DRV_MSG_CODE_SET_VMAC;
+	mb_params.param = DRV_MSG_CODE_VMAC_TYPE_MAC <<
+			  DRV_MSG_CODE_VMAC_TYPE_SHIFT;
+	mb_params.param |= MCP_PF_ID(p_hwfn);
+	ether_addr_copy(&union_data.raw_data[0], mac);
+	mb_params.p_data_src = &union_data;
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc)
+		DP_ERR(p_hwfn, "Failed to send mac address, rc = %d\n", rc);
+	return rc;
+}
+
+/* Send OV_UPDATE_WOL to the MFW with the value matching @wol */
+int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, enum qed_ov_wol wol)
+{
+	u32 wol_val, mcp_resp = 0, mcp_param = 0;
+	int rc;
+
+	/* Pick the mailbox encoding of the qed WoL mode */
+	switch (wol) {
+	case QED_OV_WOL_DEFAULT:
+		wol_val = DRV_MB_PARAM_WOL_DEFAULT;
+		break;
+	case QED_OV_WOL_DISABLED:
+		wol_val = DRV_MB_PARAM_WOL_DISABLED;
+		break;
+	case QED_OV_WOL_ENABLED:
+		wol_val = DRV_MB_PARAM_WOL_ENABLED;
+		break;
+	default:
+		DP_ERR(p_hwfn, "Invalid wol state %d\n", wol);
+		return -EINVAL;
+	}
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_WOL,
+			 wol_val, &mcp_resp, &mcp_param);
+	if (rc)
+		DP_ERR(p_hwfn, "Failed to send wol mode, rc = %d\n", rc);
+	return rc;
+}
+
+/* Send OV_UPDATE_ESWITCH_MODE to the MFW with the value for @eswitch */
+int qed_mcp_ov_update_eswitch(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_ov_eswitch eswitch)
+{
+	u32 mode_val, mcp_resp = 0, mcp_param = 0;
+	int rc;
+
+	/* Pick the mailbox encoding of the qed eswitch mode */
+	switch (eswitch) {
+	case QED_OV_ESWITCH_NONE:
+		mode_val = DRV_MB_PARAM_ESWITCH_MODE_NONE;
+		break;
+	case QED_OV_ESWITCH_VEB:
+		mode_val = DRV_MB_PARAM_ESWITCH_MODE_VEB;
+		break;
+	case QED_OV_ESWITCH_VEPA:
+		mode_val = DRV_MB_PARAM_ESWITCH_MODE_VEPA;
+		break;
+	default:
+		DP_ERR(p_hwfn, "Invalid eswitch mode %d\n", eswitch);
+		return -EINVAL;
+	}
+
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE,
+			 mode_val, &mcp_resp, &mcp_param);
+	if (rc)
+		DP_ERR(p_hwfn, "Failed to send eswitch mode, rc = %d\n", rc);
+	return rc;
+}
+
 int qed_mcp_set_led(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt, enum qed_led_mode mode)
 {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index dff520ed069b..89507190628d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -92,6 +92,8 @@ struct qed_mcp_function_info {
 
 #define QED_MCP_VLAN_UNSET              (0xffff)
 	u16				ovlan;
+
+	u16				mtu;
 };
 
 struct qed_mcp_nvm_common {
@@ -147,6 +149,30 @@ union qed_mcp_protocol_stats {
 	struct qed_mcp_rdma_stats rdma_stats;
 };
 
+enum qed_ov_eswitch {
+	QED_OV_ESWITCH_NONE,
+	QED_OV_ESWITCH_VEB,
+	QED_OV_ESWITCH_VEPA
+};
+
+enum qed_ov_client {
+	QED_OV_CLIENT_DRV,
+	QED_OV_CLIENT_USER,
+	QED_OV_CLIENT_VENDOR_SPEC
+};
+
+enum qed_ov_driver_state {
+	QED_OV_DRIVER_STATE_NOT_LOADED,
+	QED_OV_DRIVER_STATE_DISABLED,
+	QED_OV_DRIVER_STATE_ACTIVE
+};
+
+enum qed_ov_wol {
+	QED_OV_WOL_DEFAULT,
+	QED_OV_WOL_DISABLED,
+	QED_OV_WOL_ENABLED
+};
+
 /**
  * @brief - returns the link params of the hw function
  *
@@ -277,6 +303,69 @@ qed_mcp_send_drv_version(struct qed_hwfn *p_hwfn,
 			 struct qed_ptt *p_ptt,
 			 struct qed_mcp_drv_version *p_ver);
 
+/**
+ * @brief Notify MFW about the change in base device properties
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param client - qed client type
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_current_config(struct qed_hwfn *p_hwfn,
+				     struct qed_ptt *p_ptt,
+				     enum qed_ov_client client);
+
+/**
+ * @brief Notify MFW about the driver state
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param drv_state - Driver state
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_driver_state(struct qed_hwfn *p_hwfn,
+				   struct qed_ptt *p_ptt,
+				   enum qed_ov_driver_state drv_state);
+
+/**
+ * @brief Send MTU size to MFW
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param mtu - MTU size
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_mtu(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u16 mtu);
+
+/**
+ * @brief Send MAC address to MFW
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param mac - MAC address
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_mac(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt, u8 *mac);
+
+/**
+ * @brief Send WOL mode to MFW
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param wol - WOL mode
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt,
+			  enum qed_ov_wol wol);
+
 /**
  * @brief Set LED status
  *
@@ -546,4 +635,17 @@ int __qed_configure_pf_min_bandwidth(struct qed_hwfn *p_hwfn,
 int qed_mcp_mask_parities(struct qed_hwfn *p_hwfn,
 			  struct qed_ptt *p_ptt, u32 mask_parities);
 
+/**
+ * @brief Send eswitch mode to MFW
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param eswitch - eswitch mode
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_ov_update_eswitch(struct qed_hwfn *p_hwfn,
+			      struct qed_ptt *p_ptt,
+			      enum qed_ov_eswitch eswitch);
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 0100f5c0a4ec..775fdaafd24d 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -739,6 +739,8 @@ int qede_change_mtu(struct net_device *ndev, int new_mtu)
 
 	qede_update_mtu(edev, &args);
 
+	edev->ops->common->update_mtu(edev->cdev, args.mtu);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8488ad36a2b8..df0bd0ce2b18 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2396,6 +2396,8 @@ static void qede_init_ndev(struct qede_dev *edev)
 
 	/* Set network device HW mac */
 	ether_addr_copy(edev->ndev->dev_addr, edev->dev_info.common.hw_mac);
+
+	ndev->mtu = edev->dev_info.common.mtu;
 }
 
 /* This function converts from 32b param to two params of level and module
@@ -3751,6 +3753,8 @@ static int qede_open(struct net_device *ndev)
 
 	udp_tunnel_get_rx_info(ndev);
 
+	edev->ops->common->update_drv_state(edev->cdev, true);
+
 	return 0;
 }
 
@@ -3760,6 +3764,8 @@ static int qede_close(struct net_device *ndev)
 
 	qede_unload(edev, QEDE_UNLOAD_NORMAL);
 
+	edev->ops->common->update_drv_state(edev->cdev, false);
+
 	return 0;
 }
 
@@ -3820,6 +3826,8 @@ static int qede_set_mac_addr(struct net_device *ndev, void *p)
 	if (rc)
 		return rc;
 
+	edev->ops->common->update_mac(edev->cdev, addr->sa_data);
+
 	/* Add MAC filter according to the new unicast HW MAC address */
 	ether_addr_copy(edev->primary_mac, ndev->dev_addr);
 	return qede_set_ucast_rx_mac(edev, QED_FILTER_XCAST_TYPE_ADD,
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 8978a60371f4..5c909cd02764 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -267,6 +267,7 @@ struct qed_dev_info {
 	u8		mf_mode;
 	bool		tx_switching;
 	bool		rdma_supported;
+	u16		mtu;
 };
 
 enum qed_sb_type {
@@ -554,6 +555,33 @@ struct qed_common_ops {
  */
 	int (*set_led)(struct qed_dev *cdev,
 		       enum qed_led_mode mode);
+
+/**
+ * @brief update_drv_state - API to inform the change in the driver state.
+ *
+ * @param cdev
+ * @param active
+ *
+ */
+	int (*update_drv_state)(struct qed_dev *cdev, bool active);
+
+/**
+ * @brief update_mac - API to inform the change in the mac address
+ *
+ * @param cdev
+ * @param mac
+ *
+ */
+	int (*update_mac)(struct qed_dev *cdev, u8 *mac);
+
+/**
+ * @brief update_mtu - API to inform the change in the mtu
+ *
+ * @param cdev
+ * @param mtu
+ *
+ */
+	int (*update_mtu)(struct qed_dev *cdev, u16 mtu);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 7a4b21b7d1f0644456501e33d3917c9aaee76a75 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Mon, 31 Oct 2016 07:14:22 +0200
Subject: qed: Add nvram selftest

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h       |   4 +
 drivers/net/ethernet/qlogic/qed/qed_main.c      |   1 +
 drivers/net/ethernet/qlogic/qed/qed_mcp.c       |  94 ++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h       |  41 ++++++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.c  | 101 ++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_selftest.h  |  10 +++
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |   7 ++
 include/linux/qed/qed_if.h                      |   9 +++
 8 files changed, 267 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 36de87a1befa..f7dfa2ec2d19 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -8666,6 +8666,8 @@ struct public_drv_mb {
 
 #define DRV_MB_PARAM_BIST_REGISTER_TEST		1
 #define DRV_MB_PARAM_BIST_CLOCK_TEST		2
+#define DRV_MB_PARAM_BIST_NVM_TEST_NUM_IMAGES	3
+#define DRV_MB_PARAM_BIST_NVM_TEST_IMAGE_BY_INDEX	4
 
 #define DRV_MB_PARAM_BIST_RC_UNKNOWN		0
 #define DRV_MB_PARAM_BIST_RC_PASSED		1
@@ -8674,6 +8676,8 @@ struct public_drv_mb {
 
 #define DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT	0
 #define DRV_MB_PARAM_BIST_TEST_INDEX_MASK	0x000000FF
+#define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_SHIFT	8
+#define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_MASK		0x0000FF00
 
 	u32 fw_mb_header;
 #define FW_MSG_CODE_MASK			0xffff0000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index d9fa52a22667..31f8e420c830 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1508,6 +1508,7 @@ static struct qed_selftest_ops qed_selftest_ops_pass = {
 	.selftest_interrupt = &qed_selftest_interrupt,
 	.selftest_register = &qed_selftest_register,
 	.selftest_clock = &qed_selftest_clock,
+	.selftest_nvram = &qed_selftest_nvram,
 };
 
 const struct qed_common_ops qed_common_ops_pass = {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 98dc913fd76d..8be61570ce6b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1434,6 +1434,52 @@ int qed_mcp_mask_parities(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
+/* Read @len bytes of NVM from @addr into @p_buf, at most MCP_DRV_NVM_BUF_LEN
+ * bytes per command. The last response code lands in cdev->mcp_nvm_resp.
+ */
+int qed_mcp_nvm_read(struct qed_dev *cdev, u32 addr, u8 *p_buf, u32 len)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	u32 remaining = len, done = 0, chunk, got = 0;
+	u32 mcp_resp = 0, mcp_param = 0;
+	struct qed_ptt *p_ptt;
+	int rc = 0;
+
+	p_ptt = qed_ptt_acquire(p_hwfn);
+	if (!p_ptt)
+		return -EBUSY;
+
+	while (remaining > 0) {
+		chunk = min_t(u32, remaining, MCP_DRV_NVM_BUF_LEN);
+
+		/* The chunk length is shifted into the offset parameter */
+		rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt,
+					DRV_MSG_CODE_NVM_READ_NVRAM,
+					addr + done +
+					(chunk << DRV_MB_PARAM_NVM_LEN_SHIFT),
+					&mcp_resp, &mcp_param, &got,
+					(u32 *)(p_buf + done));
+		if (rc || (mcp_resp != FW_MSG_CODE_NVM_OK)) {
+			DP_NOTICE(cdev, "MCP command rc = %d\n", rc);
+			break;
+		}
+
+		/* Reading can take long and the scheduler may not preempt us:
+		 * sleep briefly when the count left crosses a 0x1000 multiple.
+		 */
+		if (remaining % 0x1000 < (remaining - got) % 0x1000)
+			usleep_range(1000, 2000);
+
+		done += got;
+		remaining -= got;
+	}
+
+	cdev->mcp_nvm_resp = mcp_resp;
+	qed_ptt_release(p_hwfn, p_ptt);
+
+	return rc;
+}
+
 int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
 	u32 drv_mb_param = 0, rsp, param;
@@ -1475,3 +1521,51 @@ int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 
 	return rc;
 }
+
+/* Ask the MFW BIST for the NVM image count, returned through @num_images */
+int qed_mcp_bist_nvm_test_get_num_images(struct qed_hwfn *p_hwfn,
+					 struct qed_ptt *p_ptt,
+					 u32 *num_images)
+{
+	u32 bist_cmd, mcp_resp;
+	int rc;
+
+	bist_cmd = DRV_MB_PARAM_BIST_NVM_TEST_NUM_IMAGES <<
+		   DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT;
+	rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+			 bist_cmd, &mcp_resp, num_images);
+	if (rc)
+		return rc;
+
+	/* The command went through - now check the response code */
+	if ((mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK)
+		return -EINVAL;
+	return 0;
+}
+
+/* Fetch the attributes of NVM image @image_index into @p_image_att; -EINVAL
+ * unless the MFW response code is OK and the image's return_code is 1.
+ */
+int qed_mcp_bist_nvm_test_get_image_att(struct qed_hwfn *p_hwfn,
+					struct qed_ptt *p_ptt,
+					struct bist_nvm_image_att *p_image_att,
+					u32 image_index)
+{
+	u32 bist_cmd, mcp_resp = 0, mcp_param = 0, att_len = 0;
+	int rc;
+
+	bist_cmd = DRV_MB_PARAM_BIST_NVM_TEST_IMAGE_BY_INDEX <<
+		   DRV_MB_PARAM_BIST_TEST_INDEX_SHIFT;
+	bist_cmd |= image_index << DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_SHIFT;
+	rc = qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_BIST_TEST,
+				bist_cmd, &mcp_resp, &mcp_param, &att_len,
+				(u32 *)p_image_att);
+	if (rc)
+		return rc;
+
+	if ((mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_OK)
+		return -EINVAL;
+	if (p_image_att->return_code != 1)
+		return -EINVAL;
+	return 0;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 89507190628d..be8152d49de2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -379,6 +379,18 @@ int qed_mcp_set_led(struct qed_hwfn *p_hwfn,
 		    struct qed_ptt *p_ptt,
 		    enum qed_led_mode mode);
 
+/**
+ * @brief Read from nvm
+ *
+ *  @param cdev
+ *  @param addr - nvm offset
+ *  @param p_buf - nvm read buffer
+ *  @param len - buffer len
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_nvm_read(struct qed_dev *cdev, u32 addr, u8 *p_buf, u32 len);
+
 /**
  * @brief Bist register test
  *
@@ -401,6 +413,35 @@ int qed_mcp_bist_register_test(struct qed_hwfn *p_hwfn,
 int qed_mcp_bist_clock_test(struct qed_hwfn *p_hwfn,
 			    struct qed_ptt *p_ptt);
 
+/**
+ * @brief Bist nvm test - get number of images
+ *
+ *  @param p_hwfn       - hw function
+ *  @param p_ptt        - PTT required for register access
+ *  @param num_images   - number of images if operation was
+ *			  successful. 0 if not.
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_nvm_test_get_num_images(struct qed_hwfn *p_hwfn,
+					 struct qed_ptt *p_ptt,
+					 u32 *num_images);
+
+/**
+ * @brief Bist nvm test - get image attributes by index
+ *
+ *  @param p_hwfn      - hw function
+ *  @param p_ptt       - PTT required for register access
+ *  @param p_image_att - Attributes of image
+ *  @param image_index - Index of image to get information for
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_bist_nvm_test_get_image_att(struct qed_hwfn *p_hwfn,
+					struct qed_ptt *p_ptt,
+					struct bist_nvm_image_att *p_image_att,
+					u32 image_index);
+
 /* Using hwfn number (and not pf_num) is required since in CMT mode,
  * same pf_num may be used by two different hwfn
  * TODO - this shouldn't really be in .h file, but until all fields
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.c b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
index 9b7678f26909..48bfaecaf6dc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_selftest.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.c
@@ -1,3 +1,4 @@
+#include <linux/crc32.h>
 #include "qed.h"
 #include "qed_dev_api.h"
 #include "qed_mcp.h"
@@ -75,3 +76,103 @@ int qed_selftest_clock(struct qed_dev *cdev)
 
 	return rc;
 }
+
+/* Check the stored CRC of the NVM images reported by the MFW; 0 on success */
+int qed_selftest_nvram(struct qed_dev *cdev)
+{
+	struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *p_ptt = qed_ptt_acquire(p_hwfn);
+	u32 num_images, i, j, nvm_crc, calc_crc;
+	struct bist_nvm_image_att image_att;
+	u8 *buf = NULL;
+	__be32 val;
+	int rc;
+
+	if (!p_ptt) {
+		DP_ERR(p_hwfn, "failed to acquire ptt\n");
+		return -EBUSY;
+	}
+
+	/* Acquire from MFW the amount of available images */
+	rc = qed_mcp_bist_nvm_test_get_num_images(p_hwfn, p_ptt, &num_images);
+	if (rc || !num_images) {
+		DP_ERR(p_hwfn, "Failed getting number of images\n");
+		/* The PTT is held by now - leave through the release path */
+		rc = -EINVAL;
+		goto err0;
+	}
+
+	/* Iterate over images and validate CRC */
+	for (i = 0; i < num_images; i++) {
+		/* Fetch the type, NVM address and length of this image */
+		rc = qed_mcp_bist_nvm_test_get_image_att(p_hwfn, p_ptt,
+							 &image_att, i);
+		if (rc) {
+			DP_ERR(p_hwfn,
+			       "Failed getting image index %d attributes\n", i);
+			goto err0;
+		}
+
+		/* After MFW crash dump is collected - the image's CRC stops
+		 * being valid.
+		 */
+		if (image_att.image_type == NVM_TYPE_MDUMP)
+			continue;
+
+		DP_VERBOSE(p_hwfn, QED_MSG_SP, "image index %d, size %x\n",
+			   i, image_att.len);
+
+		/* Allocate a buffer for holding the nvram image */
+		buf = kzalloc(image_att.len, GFP_KERNEL);
+		if (!buf) {
+			rc = -ENOMEM;
+			goto err0;
+		}
+
+		/* Read image into buffer */
+		rc = qed_mcp_nvm_read(p_hwfn->cdev, image_att.nvm_start_addr,
+				      buf, image_att.len);
+		if (rc) {
+			DP_ERR(p_hwfn,
+			       "Failed reading image index %d from nvm.\n", i);
+			goto err1;
+		}
+
+		/* Convert the buffer into big-endian format (excluding the
+		 * closing 4 bytes of CRC).
+		 */
+		for (j = 0; j < image_att.len - 4; j += 4) {
+			val = cpu_to_be32(*(u32 *)&buf[j]);
+			*(u32 *)&buf[j] = (__force u32)val;
+		}
+
+		/* Calc CRC for the "actual" image buffer, i.e. not including
+		 * the last 4 CRC bytes.
+		 */
+		nvm_crc = *(u32 *)(buf + image_att.len - 4);
+		calc_crc = crc32(0xffffffff, buf, image_att.len - 4);
+		calc_crc = (__force u32)~cpu_to_be32(calc_crc);
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "nvm crc 0x%x, calc_crc 0x%x\n", nvm_crc, calc_crc);
+
+		if (calc_crc != nvm_crc) {
+			rc = -EINVAL;
+			goto err1;
+		}
+
+		/* Done with this image; Free to prevent double release
+		 * on subsequent failure.
+		 */
+		kfree(buf);
+		buf = NULL;
+	}
+
+	qed_ptt_release(p_hwfn, p_ptt);
+	return 0;
+
+err1:
+	kfree(buf);
+err0:
+	qed_ptt_release(p_hwfn, p_ptt);
+	return rc;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_selftest.h b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
index 50eb0b49950f..739ddb730967 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_selftest.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_selftest.h
@@ -37,4 +37,14 @@ int qed_selftest_register(struct qed_dev *cdev);
  * @return int
  */
 int qed_selftest_clock(struct qed_dev *cdev);
+
+/**
+ * @brief qed_selftest_nvram - Perform nvram test
+ *
+ * @param cdev
+ *
+ * @return int
+ */
+int qed_selftest_nvram(struct qed_dev *cdev);
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 775fdaafd24d..a8094088b9ac 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -157,6 +157,7 @@ enum qede_ethtool_tests {
 	QEDE_ETHTOOL_MEMORY_TEST,
 	QEDE_ETHTOOL_REGISTER_TEST,
 	QEDE_ETHTOOL_CLOCK_TEST,
+	QEDE_ETHTOOL_NVRAM_TEST,
 	QEDE_ETHTOOL_TEST_MAX
 };
 
@@ -166,6 +167,7 @@ static const char qede_tests_str_arr[QEDE_ETHTOOL_TEST_MAX][ETH_GSTRING_LEN] = {
 	"Memory (online)\t\t",
 	"Register (online)\t",
 	"Clock (online)\t\t",
+	"Nvram (online)\t\t",
 };
 
 static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
@@ -1392,6 +1394,11 @@ static void qede_self_test(struct net_device *dev,
 		buf[QEDE_ETHTOOL_CLOCK_TEST] = 1;
 		etest->flags |= ETH_TEST_FL_FAILED;
 	}
+
+	if (edev->ops->common->selftest->selftest_nvram(edev->cdev)) {
+		buf[QEDE_ETHTOOL_NVRAM_TEST] = 1;
+		etest->flags |= ETH_TEST_FL_FAILED;
+	}
 }
 
 static int qede_set_tunable(struct net_device *dev,
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 5c909cd02764..ffc2d2f5e88f 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -402,6 +402,15 @@ struct qed_selftest_ops {
  * @return 0 on success, error otherwise.
  */
 	int (*selftest_clock)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_nvram - Perform nvram test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_nvram) (struct qed_dev *cdev);
 };
 
 struct qed_common_ops {
-- 
cgit v1.2.3


From 14d39648cbfc6289e3f873d30f282b9517ebe860 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Mon, 31 Oct 2016 07:14:23 +0200
Subject: qed*: Add support for WoL

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h           | 11 ++++-
 drivers/net/ethernet/qlogic/qed/qed_dev.c       | 19 ++++++++-
 drivers/net/ethernet/qlogic/qed/qed_hsi.h       |  4 ++
 drivers/net/ethernet/qlogic/qed/qed_main.c      | 29 +++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c       | 56 ++++++++++++++++++++++++-
 drivers/net/ethernet/qlogic/qede/qede.h         |  2 +
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 41 ++++++++++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  9 ++++
 include/linux/qed/qed_if.h                      | 10 +++++
 9 files changed, 176 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index f20243c1fb0b..8828ffac4b23 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -195,6 +195,11 @@ enum qed_dev_cap {
 	QED_DEV_CAP_ROCE,
 };
 
+enum qed_wol_support {
+	QED_WOL_SUPPORT_NONE,
+	QED_WOL_SUPPORT_PME,
+};
+
 struct qed_hw_info {
 	/* PCI personality */
 	enum qed_pci_personality	personality;
@@ -227,6 +232,8 @@ struct qed_hw_info {
 	u32				hw_mode;
 	unsigned long		device_capabilities;
 	u16				mtu;
+
+	enum qed_wol_support b_wol_support;
 };
 
 struct qed_hw_cid_data {
@@ -539,7 +546,9 @@ struct qed_dev {
 	u8				mcp_rev;
 	u8				boot_mode;
 
-	u8				wol;
+	/* WoL related configurations */
+	u8 wol_config;
+	u8 wol_mac[ETH_ALEN];
 
 	u32				int_mode;
 	enum qed_coalescing_mode	int_coalescing_mode;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 33fd69e24bae..127ed5f27d8d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1364,8 +1364,24 @@ int qed_hw_reset(struct qed_dev *cdev)
 {
 	int rc = 0;
 	u32 unload_resp, unload_param;
+	u32 wol_param;
 	int i;
 
+	switch (cdev->wol_config) {
+	case QED_OV_WOL_DISABLED:
+		wol_param = DRV_MB_PARAM_UNLOAD_WOL_DISABLED;
+		break;
+	case QED_OV_WOL_ENABLED:
+		wol_param = DRV_MB_PARAM_UNLOAD_WOL_ENABLED;
+		break;
+	default:
+		DP_NOTICE(cdev,
+			  "Unknown WoL configuration %02x\n", cdev->wol_config);
+		/* Fallthrough */
+	case QED_OV_WOL_DEFAULT:
+		wol_param = DRV_MB_PARAM_UNLOAD_WOL_MCP;
+	}
+
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
@@ -1394,8 +1410,7 @@ int qed_hw_reset(struct qed_dev *cdev)
 
 		/* Send unload command to MCP */
 		rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
-				 DRV_MSG_CODE_UNLOAD_REQ,
-				 DRV_MB_PARAM_UNLOAD_WOL_MCP,
+				 DRV_MSG_CODE_UNLOAD_REQ, wol_param,
 				 &unload_resp, &unload_param);
 		if (rc) {
 			DP_NOTICE(p_hwfn, "qed_hw_reset: UNLOAD_REQ failed\n");
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index f7dfa2ec2d19..fdb7a099955b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -8601,6 +8601,7 @@ struct public_drv_mb {
 
 #define DRV_MSG_CODE_BIST_TEST			0x001e0000
 #define DRV_MSG_CODE_SET_LED_MODE		0x00200000
+#define DRV_MSG_CODE_OS_WOL			0x002e0000
 
 #define DRV_MSG_SEQ_NUMBER_MASK			0x0000ffff
 
@@ -8697,6 +8698,9 @@ struct public_drv_mb {
 #define FW_MSG_CODE_NVM_OK			0x00010000
 #define FW_MSG_CODE_OK				0x00160000
 
+#define FW_MSG_CODE_OS_WOL_SUPPORTED            0x00800000
+#define FW_MSG_CODE_OS_WOL_NOT_SUPPORTED        0x00810000
+
 #define FW_MSG_SEQ_NUMBER_MASK			0x0000ffff
 
 	u32 fw_mb_param;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 31f8e420c830..b71d73a41b10 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -221,6 +221,10 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 		dev_info->fw_eng = FW_ENGINEERING_VERSION;
 		dev_info->mf_mode = cdev->mf_mode;
 		dev_info->tx_switching = true;
+
+		if (QED_LEADING_HWFN(cdev)->hw_info.b_wol_support ==
+		    QED_WOL_SUPPORT_PME)
+			dev_info->wol_support = true;
 	} else {
 		qed_vf_get_fw_version(&cdev->hwfns[0], &dev_info->fw_major,
 				      &dev_info->fw_minor, &dev_info->fw_rev,
@@ -1433,6 +1437,30 @@ static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode)
 	return status;
 }
 
+static int qed_update_wol(struct qed_dev *cdev, bool enabled)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	struct qed_ptt *ptt;
+	int rc = 0;
+
+	if (IS_VF(cdev))
+		return 0;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt)
+		return -EAGAIN;
+
+	rc = qed_mcp_ov_update_wol(hwfn, ptt, enabled ? QED_OV_WOL_ENABLED
+				   : QED_OV_WOL_DISABLED);
+	if (rc)
+		goto out;
+	rc = qed_mcp_ov_update_current_config(hwfn, ptt, QED_OV_CLIENT_DRV);
+
+out:
+	qed_ptt_release(hwfn, ptt);
+	return rc;
+}
+
 static int qed_update_drv_state(struct qed_dev *cdev, bool active)
 {
 	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
@@ -1541,6 +1569,7 @@ const struct qed_common_ops qed_common_ops_pass = {
 	.update_drv_state = &qed_update_drv_state,
 	.update_mac = &qed_update_mac,
 	.update_mtu = &qed_update_mtu,
+	.update_wol = &qed_update_wol,
 };
 
 void qed_get_protocol_stats(struct qed_dev *cdev,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 8be61570ce6b..768b35b1dca0 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -330,6 +330,7 @@ static int qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
 				 struct qed_mcp_mb_params *p_mb_params)
 {
 	u32 union_data_addr;
+
 	int rc;
 
 	/* MCP not initialized */
@@ -375,11 +376,32 @@ int qed_mcp_cmd(struct qed_hwfn *p_hwfn,
 		u32 *o_mcp_param)
 {
 	struct qed_mcp_mb_params mb_params;
+	union drv_union_data data_src;
 	int rc;
 
 	memset(&mb_params, 0, sizeof(mb_params));
+	memset(&data_src, 0, sizeof(data_src));
 	mb_params.cmd = cmd;
 	mb_params.param = param;
+
+	/* In case of UNLOAD_DONE, set the primary MAC */
+	if ((cmd == DRV_MSG_CODE_UNLOAD_DONE) &&
+	    (p_hwfn->cdev->wol_config == QED_OV_WOL_ENABLED)) {
+		u8 *p_mac = p_hwfn->cdev->wol_mac;
+
+		data_src.wol_mac.mac_upper = p_mac[0] << 8 | p_mac[1];
+		data_src.wol_mac.mac_lower = p_mac[2] << 24 | p_mac[3] << 16 |
+					     p_mac[4] << 8 | p_mac[5];
+
+		DP_VERBOSE(p_hwfn,
+			   (QED_MSG_SP | NETIF_MSG_IFDOWN),
+			   "Setting WoL MAC: %pM --> [%08x,%08x]\n",
+			   p_mac, data_src.wol_mac.mac_upper,
+			   data_src.wol_mac.mac_lower);
+
+		mb_params.p_data_src = &data_src;
+	}
+
 	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
 	if (rc)
 		return rc;
@@ -1058,6 +1080,9 @@ int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn,
 		info->mac[3] = (u8)(shmem_info.mac_lower >> 16);
 		info->mac[4] = (u8)(shmem_info.mac_lower >> 8);
 		info->mac[5] = (u8)(shmem_info.mac_lower);
+
+		/* Store primary MAC for later possible WoL */
+		memcpy(&p_hwfn->cdev->wol_mac, info->mac, ETH_ALEN);
 	} else {
 		DP_NOTICE(p_hwfn, "MAC is 0 in shmem\n");
 	}
@@ -1071,13 +1096,28 @@ int qed_mcp_fill_shmem_func_info(struct qed_hwfn *p_hwfn,
 
 	info->mtu = (u16)shmem_info.mtu_size;
 
+	p_hwfn->hw_info.b_wol_support = QED_WOL_SUPPORT_NONE;
+	p_hwfn->cdev->wol_config = (u8)QED_OV_WOL_DEFAULT;
+	if (qed_mcp_is_init(p_hwfn)) {
+		u32 resp = 0, param = 0;
+		int rc;
+
+		rc = qed_mcp_cmd(p_hwfn, p_ptt,
+				 DRV_MSG_CODE_OS_WOL, 0, &resp, &param);
+		if (rc)
+			return rc;
+		if (resp == FW_MSG_CODE_OS_WOL_SUPPORTED)
+			p_hwfn->hw_info.b_wol_support = QED_WOL_SUPPORT_PME;
+	}
+
 	DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_IFUP),
-		   "Read configuration from shmem: pause_on_host %02x protocol %02x BW [%02x - %02x] MAC %02x:%02x:%02x:%02x:%02x:%02x wwn port %llx node %llx ovlan %04x\n",
+		   "Read configuration from shmem: pause_on_host %02x protocol %02x BW [%02x - %02x] MAC %02x:%02x:%02x:%02x:%02x:%02x wwn port %llx node %llx ovlan %04x wol %02x\n",
 		info->pause_on_host, info->protocol,
 		info->bandwidth_min, info->bandwidth_max,
 		info->mac[0], info->mac[1], info->mac[2],
 		info->mac[3], info->mac[4], info->mac[5],
-		info->wwn_port, info->wwn_node, info->ovlan);
+		info->wwn_port, info->wwn_node,
+		info->ovlan, (u8)p_hwfn->hw_info.b_wol_support);
 
 	return 0;
 }
@@ -1322,6 +1362,9 @@ int qed_mcp_ov_update_mac(struct qed_hwfn *p_hwfn,
 	if (rc)
 		DP_ERR(p_hwfn, "Failed to send mac address, rc = %d\n", rc);
 
+	/* Store primary MAC for later possible WoL */
+	memcpy(p_hwfn->cdev->wol_mac, mac, ETH_ALEN);
+
 	return rc;
 }
 
@@ -1332,6 +1375,12 @@ int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn,
 	u32 drv_mb_param;
 	int rc;
 
+	if (p_hwfn->hw_info.b_wol_support == QED_WOL_SUPPORT_NONE) {
+		DP_VERBOSE(p_hwfn, QED_MSG_SP,
+			   "Can't change WoL configuration when WoL isn't supported\n");
+		return -EINVAL;
+	}
+
 	switch (wol) {
 	case QED_OV_WOL_DEFAULT:
 		drv_mb_param = DRV_MB_PARAM_WOL_DEFAULT;
@@ -1352,6 +1401,9 @@ int qed_mcp_ov_update_wol(struct qed_hwfn *p_hwfn,
 	if (rc)
 		DP_ERR(p_hwfn, "Failed to send wol mode, rc = %d\n", rc);
 
+	/* Store the WoL update for a future unload */
+	p_hwfn->cdev->wol_config = (u8)wol;
+
 	return rc;
 }
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index cf8d3547aecf..0cba21bf9d5f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -193,6 +193,8 @@ struct qede_dev {
 	u16				vxlan_dst_port;
 	u16				geneve_dst_port;
 
+	bool wol_enabled;
+
 	struct qede_rdma_dev		rdma_info;
 };
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index a8094088b9ac..327c614e76aa 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -483,6 +483,45 @@ static void qede_get_drvinfo(struct net_device *ndev,
 	strlcpy(info->bus_info, pci_name(edev->pdev), sizeof(info->bus_info));
 }
 
+static void qede_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+
+	if (edev->dev_info.common.wol_support) {
+		wol->supported = WAKE_MAGIC;
+		wol->wolopts = edev->wol_enabled ? WAKE_MAGIC : 0;
+	}
+}
+
+static int qede_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
+{
+	struct qede_dev *edev = netdev_priv(ndev);
+	bool wol_requested;
+	int rc;
+
+	if (wol->wolopts & ~WAKE_MAGIC) {
+		DP_INFO(edev,
+			"Can't support WoL options other than magic-packet\n");
+		return -EINVAL;
+	}
+
+	wol_requested = !!(wol->wolopts & WAKE_MAGIC);
+	if (wol_requested == edev->wol_enabled)
+		return 0;
+
+	/* Need to actually change configuration */
+	if (!edev->dev_info.common.wol_support) {
+		DP_INFO(edev, "Device doesn't support WoL\n");
+		return -EINVAL;
+	}
+
+	rc = edev->ops->common->update_wol(edev->cdev, wol_requested);
+	if (!rc)
+		edev->wol_enabled = wol_requested;
+
+	return rc;
+}
+
 static u32 qede_get_msglevel(struct net_device *ndev)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
@@ -1449,6 +1488,8 @@ static const struct ethtool_ops qede_ethtool_ops = {
 	.get_drvinfo = qede_get_drvinfo,
 	.get_regs_len = qede_get_regs_len,
 	.get_regs = qede_get_regs,
+	.get_wol = qede_get_wol,
+	.set_wol = qede_set_wol,
 	.get_msglevel = qede_get_msglevel,
 	.set_msglevel = qede_set_msglevel,
 	.nway_reset = qede_nway_reset,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index df0bd0ce2b18..873f2ebe249e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -95,6 +95,7 @@ static int qede_probe(struct pci_dev *pdev, const struct pci_device_id *id);
 #define TX_TIMEOUT		(5 * HZ)
 
 static void qede_remove(struct pci_dev *pdev);
+static void qede_shutdown(struct pci_dev *pdev);
 static int qede_alloc_rx_buffer(struct qede_dev *edev,
 				struct qede_rx_queue *rxq);
 static void qede_link_update(void *dev, struct qed_link_output *link);
@@ -166,6 +167,7 @@ static struct pci_driver qede_pci_driver = {
 	.id_table = qede_pci_tbl,
 	.probe = qede_probe,
 	.remove = qede_remove,
+	.shutdown = qede_shutdown,
 #ifdef CONFIG_QED_SRIOV
 	.sriov_configure = qede_sriov_configure,
 #endif
@@ -2705,6 +2707,8 @@ static void __qede_remove(struct pci_dev *pdev, enum qede_remove_mode mode)
 
 	/* Use global ops since we've freed edev */
 	qed_ops->common->slowpath_stop(cdev);
+	if (system_state == SYSTEM_POWER_OFF)
+		return;
 	qed_ops->common->remove(cdev);
 
 	dev_info(&pdev->dev, "Ending qede_remove successfully\n");
@@ -2715,6 +2719,11 @@ static void qede_remove(struct pci_dev *pdev)
 	__qede_remove(pdev, QEDE_REMOVE_NORMAL);
 }
 
+static void qede_shutdown(struct pci_dev *pdev)
+{
+	__qede_remove(pdev, QEDE_REMOVE_NORMAL);
+}
+
 /* -------------------------------------------------------------------------
  * START OF LOAD / UNLOAD
  * -------------------------------------------------------------------------
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ffc2d2f5e88f..ea095b4893aa 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -268,6 +268,8 @@ struct qed_dev_info {
 	bool		tx_switching;
 	bool		rdma_supported;
 	u16		mtu;
+
+	bool wol_support;
 };
 
 enum qed_sb_type {
@@ -591,6 +593,14 @@ struct qed_common_ops {
  *
  */
 	int (*update_mtu)(struct qed_dev *cdev, u16 mtu);
+
+/**
+ * @brief update_wol - update of changes in the WoL configuration
+ *
+ * @param cdev
+ * @param enabled - true iff WoL should be enabled.
+ */
+	int (*update_wol) (struct qed_dev *cdev, bool enabled);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 2edbff8dcb5da324fd4c4fe953629e4f6ca73c99 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <Tomer.Tayar@cavium.com>
Date: Mon, 31 Oct 2016 07:14:27 +0200
Subject: qed: Learn resources from management firmware

Currently, each interface assumes it receives an equal portion
of HW/FW resources, but this is wasteful - different partitions
[and specifically, partitions exposing different protocol support]
might require different resources.

Implement a new resource learning scheme where the information is
received directly from the management firmware [which has knowledge
of all of the functions and can serve as arbiter].

Signed-off-by: Tomer Tayar <Tomer.Tayar@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h     |   6 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c | 291 ++++++++++++++++++++++++------
 drivers/net/ethernet/qlogic/qed/qed_hsi.h |  46 +++++
 drivers/net/ethernet/qlogic/qed/qed_l2.c  |   2 +-
 drivers/net/ethernet/qlogic/qed/qed_mcp.c |  42 +++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h |  15 ++
 include/linux/qed/qed_eth_if.h            |   2 +-
 7 files changed, 341 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 6d3013fe6987..50b8a01ff512 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -154,7 +154,10 @@ struct qed_qm_iids {
 	u32 tids;
 };
 
-enum QED_RESOURCES {
+/* HW / FW resources, output of features supported below, most information
+ * is received from MFW.
+ */
+enum qed_resources {
 	QED_SB,
 	QED_L2_QUEUE,
 	QED_VPORT,
@@ -166,6 +169,7 @@ enum QED_RESOURCES {
 	QED_RDMA_CNQ_RAM,
 	QED_ILT,
 	QED_LL2_QUEUE,
+	QED_CMDQS_CQS,
 	QED_RDMA_STATS_QUEUE,
 	QED_MAX_RESC,
 };
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index d996afe833ee..5be7b8a25425 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1512,47 +1512,240 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn)
 		   RESC_NUM(p_hwfn, QED_SB), num_features);
 }
 
-static int qed_hw_get_resc(struct qed_hwfn *p_hwfn)
+static enum resource_id_enum qed_hw_get_mfw_res_id(enum qed_resources res_id)
+{
+	enum resource_id_enum mfw_res_id = RESOURCE_NUM_INVALID;
+
+	switch (res_id) {
+	case QED_SB:
+		mfw_res_id = RESOURCE_NUM_SB_E;
+		break;
+	case QED_L2_QUEUE:
+		mfw_res_id = RESOURCE_NUM_L2_QUEUE_E;
+		break;
+	case QED_VPORT:
+		mfw_res_id = RESOURCE_NUM_VPORT_E;
+		break;
+	case QED_RSS_ENG:
+		mfw_res_id = RESOURCE_NUM_RSS_ENGINES_E;
+		break;
+	case QED_PQ:
+		mfw_res_id = RESOURCE_NUM_PQ_E;
+		break;
+	case QED_RL:
+		mfw_res_id = RESOURCE_NUM_RL_E;
+		break;
+	case QED_MAC:
+	case QED_VLAN:
+		/* Each VFC resource can accommodate both a MAC and a VLAN */
+		mfw_res_id = RESOURCE_VFC_FILTER_E;
+		break;
+	case QED_ILT:
+		mfw_res_id = RESOURCE_ILT_E;
+		break;
+	case QED_LL2_QUEUE:
+		mfw_res_id = RESOURCE_LL2_QUEUE_E;
+		break;
+	case QED_RDMA_CNQ_RAM:
+	case QED_CMDQS_CQS:
+		/* CNQ/CMDQS are the same resource */
+		mfw_res_id = RESOURCE_CQS_E;
+		break;
+	case QED_RDMA_STATS_QUEUE:
+		mfw_res_id = RESOURCE_RDMA_STATS_QUEUE_E;
+		break;
+	default:
+		break;
+	}
+
+	return mfw_res_id;
+}
+
+static u32 qed_hw_get_dflt_resc_num(struct qed_hwfn *p_hwfn,
+				    enum qed_resources res_id)
 {
-	u8 enabled_func_idx = p_hwfn->enabled_func_idx;
-	u32 *resc_start = p_hwfn->hw_info.resc_start;
 	u8 num_funcs = p_hwfn->num_funcs_on_engine;
-	u32 *resc_num = p_hwfn->hw_info.resc_num;
 	struct qed_sb_cnt_info sb_cnt_info;
-	int i, max_vf_vlan_filters;
+	u32 dflt_resc_num = 0;
 
-	memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+	switch (res_id) {
+	case QED_SB:
+		memset(&sb_cnt_info, 0, sizeof(sb_cnt_info));
+		qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
+		dflt_resc_num = sb_cnt_info.sb_cnt;
+		break;
+	case QED_L2_QUEUE:
+		dflt_resc_num = MAX_NUM_L2_QUEUES_BB / num_funcs;
+		break;
+	case QED_VPORT:
+		dflt_resc_num = MAX_NUM_VPORTS_BB / num_funcs;
+		break;
+	case QED_RSS_ENG:
+		dflt_resc_num = ETH_RSS_ENGINE_NUM_BB / num_funcs;
+		break;
+	case QED_PQ:
+		/* The granularity of the PQs is 8 */
+		dflt_resc_num = MAX_QM_TX_QUEUES_BB / num_funcs;
+		dflt_resc_num &= ~0x7;
+		break;
+	case QED_RL:
+		dflt_resc_num = MAX_QM_GLOBAL_RLS / num_funcs;
+		break;
+	case QED_MAC:
+	case QED_VLAN:
+		/* Each VFC resource can accommodate both a MAC and a VLAN */
+		dflt_resc_num = ETH_NUM_MAC_FILTERS / num_funcs;
+		break;
+	case QED_ILT:
+		dflt_resc_num = PXP_NUM_ILT_RECORDS_BB / num_funcs;
+		break;
+	case QED_LL2_QUEUE:
+		dflt_resc_num = MAX_NUM_LL2_RX_QUEUES / num_funcs;
+		break;
+	case QED_RDMA_CNQ_RAM:
+	case QED_CMDQS_CQS:
+		/* CNQ/CMDQS are the same resource */
+		dflt_resc_num = NUM_OF_CMDQS_CQS / num_funcs;
+		break;
+	case QED_RDMA_STATS_QUEUE:
+		dflt_resc_num = RDMA_NUM_STATISTIC_COUNTERS_BB / num_funcs;
+		break;
+	default:
+		break;
+	}
 
-#ifdef CONFIG_QED_SRIOV
-	max_vf_vlan_filters = QED_ETH_MAX_VF_NUM_VLAN_FILTERS;
-#else
-	max_vf_vlan_filters = 0;
-#endif
+	return dflt_resc_num;
+}
+
+static const char *qed_hw_get_resc_name(enum qed_resources res_id)
+{
+	switch (res_id) {
+	case QED_SB:
+		return "SB";
+	case QED_L2_QUEUE:
+		return "L2_QUEUE";
+	case QED_VPORT:
+		return "VPORT";
+	case QED_RSS_ENG:
+		return "RSS_ENG";
+	case QED_PQ:
+		return "PQ";
+	case QED_RL:
+		return "RL";
+	case QED_MAC:
+		return "MAC";
+	case QED_VLAN:
+		return "VLAN";
+	case QED_RDMA_CNQ_RAM:
+		return "RDMA_CNQ_RAM";
+	case QED_ILT:
+		return "ILT";
+	case QED_LL2_QUEUE:
+		return "LL2_QUEUE";
+	case QED_CMDQS_CQS:
+		return "CMDQS_CQS";
+	case QED_RDMA_STATS_QUEUE:
+		return "RDMA_STATS_QUEUE";
+	default:
+		return "UNKNOWN_RESOURCE";
+	}
+}
 
-	qed_int_get_num_sbs(p_hwfn, &sb_cnt_info);
+static int qed_hw_set_resc_info(struct qed_hwfn *p_hwfn,
+				enum qed_resources res_id)
+{
+	u32 dflt_resc_num = 0, dflt_resc_start = 0, mcp_resp, mcp_param;
+	u32 *p_resc_num, *p_resc_start;
+	struct resource_info resc_info;
+	int rc;
+
+	p_resc_num = &RESC_NUM(p_hwfn, res_id);
+	p_resc_start = &RESC_START(p_hwfn, res_id);
+
+	/* Default values assumes that each function received equal share */
+	dflt_resc_num = qed_hw_get_dflt_resc_num(p_hwfn, res_id);
+	if (!dflt_resc_num) {
+		DP_ERR(p_hwfn,
+		       "Failed to get default amount for resource %d [%s]\n",
+		       res_id, qed_hw_get_resc_name(res_id));
+		return -EINVAL;
+	}
+	dflt_resc_start = dflt_resc_num * p_hwfn->enabled_func_idx;
+
+	memset(&resc_info, 0, sizeof(resc_info));
+	resc_info.res_id = qed_hw_get_mfw_res_id(res_id);
+	if (resc_info.res_id == RESOURCE_NUM_INVALID) {
+		DP_ERR(p_hwfn,
+		       "Failed to match resource %d [%s] with the MFW resources\n",
+		       res_id, qed_hw_get_resc_name(res_id));
+		return -EINVAL;
+	}
+
+	rc = qed_mcp_get_resc_info(p_hwfn, p_hwfn->p_main_ptt, &resc_info,
+				   &mcp_resp, &mcp_param);
+	if (rc) {
+		DP_NOTICE(p_hwfn,
+			  "MFW response failure for an allocation request for resource %d [%s]\n",
+			  res_id, qed_hw_get_resc_name(res_id));
+		return rc;
+	}
+
+	/* Default driver values are applied in the following cases:
+	 * - The resource allocation MB command is not supported by the MFW
+	 * - There is an internal error in the MFW while processing the request
+	 * - The resource ID is unknown to the MFW
+	 */
+	if (mcp_resp != FW_MSG_CODE_RESOURCE_ALLOC_OK &&
+	    mcp_resp != FW_MSG_CODE_RESOURCE_ALLOC_DEPRECATED) {
+		DP_NOTICE(p_hwfn,
+			  "Resource %d [%s]: No allocation info was received [mcp_resp 0x%x]. Applying default values [num %d, start %d].\n",
+			  res_id,
+			  qed_hw_get_resc_name(res_id),
+			  mcp_resp, dflt_resc_num, dflt_resc_start);
+		*p_resc_num = dflt_resc_num;
+		*p_resc_start = dflt_resc_start;
+		goto out;
+	}
+
+	/* Special handling for status blocks; Would be revised in future */
+	if (res_id == QED_SB) {
+		resc_info.size -= 1;
+		resc_info.offset -= p_hwfn->enabled_func_idx;
+	}
+
+	*p_resc_num = resc_info.size;
+	*p_resc_start = resc_info.offset;
+
+out:
+	/* PQs have to divide by 8 [that's the HW granularity].
+	 * Reduce number so it would fit.
+	 */
+	if ((res_id == QED_PQ) && ((*p_resc_num % 8) || (*p_resc_start % 8))) {
+		DP_INFO(p_hwfn,
+			"PQs need to align by 8; Number %08x --> %08x, Start %08x --> %08x\n",
+			*p_resc_num,
+			(*p_resc_num) & ~0x7,
+			*p_resc_start, (*p_resc_start) & ~0x7);
+		*p_resc_num &= ~0x7;
+		*p_resc_start &= ~0x7;
+	}
 
-	resc_num[QED_SB] = min_t(u32,
-				 (MAX_SB_PER_PATH_BB / num_funcs),
-				 sb_cnt_info.sb_cnt);
-	resc_num[QED_L2_QUEUE] = MAX_NUM_L2_QUEUES_BB / num_funcs;
-	resc_num[QED_VPORT] = MAX_NUM_VPORTS_BB / num_funcs;
-	resc_num[QED_RSS_ENG] = ETH_RSS_ENGINE_NUM_BB / num_funcs;
-	resc_num[QED_PQ] = MAX_QM_TX_QUEUES_BB / num_funcs;
-	resc_num[QED_RL] = min_t(u32, 64, resc_num[QED_VPORT]);
-	resc_num[QED_MAC] = ETH_NUM_MAC_FILTERS / num_funcs;
-	resc_num[QED_VLAN] = (ETH_NUM_VLAN_FILTERS - 1 /*For vlan0*/) /
-			     num_funcs;
-	resc_num[QED_ILT] = PXP_NUM_ILT_RECORDS_BB / num_funcs;
-	resc_num[QED_LL2_QUEUE] = MAX_NUM_LL2_RX_QUEUES / num_funcs;
-	resc_num[QED_RDMA_CNQ_RAM] = NUM_OF_CMDQS_CQS / num_funcs;
-	resc_num[QED_RDMA_STATS_QUEUE] = RDMA_NUM_STATISTIC_COUNTERS_BB /
-					 num_funcs;
-
-	for (i = 0; i < QED_MAX_RESC; i++)
-		resc_start[i] = resc_num[i] * enabled_func_idx;
+	return 0;
+}
+
+static int qed_hw_get_resc(struct qed_hwfn *p_hwfn)
+{
+	u8 res_id;
+	int rc;
+
+	for (res_id = 0; res_id < QED_MAX_RESC; res_id++) {
+		rc = qed_hw_set_resc_info(p_hwfn, res_id);
+		if (rc)
+			return rc;
+	}
 
 	/* Sanity for ILT */
-	if (RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_BB) {
+	if ((RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_BB)) {
 		DP_NOTICE(p_hwfn, "Can't assign ILT pages [%08x,...,%08x]\n",
 			  RESC_START(p_hwfn, QED_ILT),
 			  RESC_END(p_hwfn, QED_ILT) - 1);
@@ -1562,34 +1755,12 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn)
 	qed_hw_set_feat(p_hwfn);
 
 	DP_VERBOSE(p_hwfn, NETIF_MSG_PROBE,
-		   "The numbers for each resource are:\n"
-		   "SB = %d start = %d\n"
-		   "L2_QUEUE = %d start = %d\n"
-		   "VPORT = %d start = %d\n"
-		   "PQ = %d start = %d\n"
-		   "RL = %d start = %d\n"
-		   "MAC = %d start = %d\n"
-		   "VLAN = %d start = %d\n"
-		   "ILT = %d start = %d\n"
-		   "LL2_QUEUE = %d start = %d\n",
-		   p_hwfn->hw_info.resc_num[QED_SB],
-		   p_hwfn->hw_info.resc_start[QED_SB],
-		   p_hwfn->hw_info.resc_num[QED_L2_QUEUE],
-		   p_hwfn->hw_info.resc_start[QED_L2_QUEUE],
-		   p_hwfn->hw_info.resc_num[QED_VPORT],
-		   p_hwfn->hw_info.resc_start[QED_VPORT],
-		   p_hwfn->hw_info.resc_num[QED_PQ],
-		   p_hwfn->hw_info.resc_start[QED_PQ],
-		   p_hwfn->hw_info.resc_num[QED_RL],
-		   p_hwfn->hw_info.resc_start[QED_RL],
-		   p_hwfn->hw_info.resc_num[QED_MAC],
-		   p_hwfn->hw_info.resc_start[QED_MAC],
-		   p_hwfn->hw_info.resc_num[QED_VLAN],
-		   p_hwfn->hw_info.resc_start[QED_VLAN],
-		   p_hwfn->hw_info.resc_num[QED_ILT],
-		   p_hwfn->hw_info.resc_start[QED_ILT],
-		   RESC_NUM(p_hwfn, QED_LL2_QUEUE),
-		   RESC_START(p_hwfn, QED_LL2_QUEUE));
+		   "The numbers for each resource are:\n");
+	for (res_id = 0; res_id < QED_MAX_RESC; res_id++)
+		DP_VERBOSE(p_hwfn, NETIF_MSG_PROBE, "%s = %d start = %d\n",
+			   qed_hw_get_resc_name(res_id),
+			   RESC_NUM(p_hwfn, res_id),
+			   RESC_START(p_hwfn, res_id));
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index 1d113ce814e1..048f9a342413 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -8529,6 +8529,41 @@ struct mdump_config_stc {
 	u32 valid_logs;
 };
 
+enum resource_id_enum {
+	RESOURCE_NUM_SB_E = 0,
+	RESOURCE_NUM_L2_QUEUE_E = 1,
+	RESOURCE_NUM_VPORT_E = 2,
+	RESOURCE_NUM_VMQ_E = 3,
+	RESOURCE_FACTOR_NUM_RSS_PF_E = 4,
+	RESOURCE_FACTOR_RSS_PER_VF_E = 5,
+	RESOURCE_NUM_RL_E = 6,
+	RESOURCE_NUM_PQ_E = 7,
+	RESOURCE_NUM_VF_E = 8,
+	RESOURCE_VFC_FILTER_E = 9,
+	RESOURCE_ILT_E = 10,
+	RESOURCE_CQS_E = 11,
+	RESOURCE_GFT_PROFILES_E = 12,
+	RESOURCE_NUM_TC_E = 13,
+	RESOURCE_NUM_RSS_ENGINES_E = 14,
+	RESOURCE_LL2_QUEUE_E = 15,
+	RESOURCE_RDMA_STATS_QUEUE_E = 16,
+	RESOURCE_MAX_NUM,
+	RESOURCE_NUM_INVALID = 0xFFFFFFFF
+};
+
+/* Resource ID is to be filled by the driver in the MB request
+ * Size, offset & flags to be filled by the MFW in the MB response
+ */
+struct resource_info {
+	enum resource_id_enum res_id;
+	u32 size;		/* number of allocated resources */
+	u32 offset;		/* Offset of the 1st resource */
+	u32 vf_size;
+	u32 vf_offset;
+	u32 flags;
+#define RESOURCE_ELEMENT_STRICT (1 << 0)
+};
+
 union drv_union_data {
 	u32 ver_str[MCP_DRV_VER_STR_SIZE_DWORD];
 	struct mcp_mac wol_mac;
@@ -8549,6 +8584,7 @@ union drv_union_data {
 	u64 reserved_stats[11];
 	struct ocbb_data_stc ocbb_info;
 	struct temperature_status_stc temp_info;
+	struct resource_info resource;
 	struct bist_nvm_image_att nvm_image_att;
 	struct mdump_config_stc mdump_config;
 };
@@ -8576,6 +8612,7 @@ struct public_drv_mb {
 
 #define DRV_MSG_CODE_BW_UPDATE_ACK		0x32000000
 #define DRV_MSG_CODE_NIG_DRAIN			0x30000000
+#define DRV_MSG_GET_RESOURCE_ALLOC_MSG          0x34000000
 #define DRV_MSG_CODE_VF_DISABLED_DONE		0xc0000000
 #define DRV_MSG_CODE_CFG_VF_MSIX		0xc0010000
 #define DRV_MSG_CODE_NVM_GET_FILE_ATT		0x00030000
@@ -8666,6 +8703,12 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_SET_LED_MODE_ON		0x1
 #define DRV_MB_PARAM_SET_LED_MODE_OFF		0x2
 
+	/* Resource Allocation params - Driver version support */
+#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_MASK	0xFFFF0000
+#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT	16
+#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_MASK	0x0000FFFF
+#define DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_SHIFT	0
+
 #define DRV_MB_PARAM_BIST_REGISTER_TEST		1
 #define DRV_MB_PARAM_BIST_CLOCK_TEST		2
 #define DRV_MB_PARAM_BIST_NVM_TEST_NUM_IMAGES	3
@@ -8694,6 +8737,9 @@ struct public_drv_mb {
 #define FW_MSG_CODE_DRV_UNLOAD_PORT		0x20120000
 #define FW_MSG_CODE_DRV_UNLOAD_FUNCTION		0x20130000
 #define FW_MSG_CODE_DRV_UNLOAD_DONE		0x21100000
+#define FW_MSG_CODE_RESOURCE_ALLOC_OK           0x34000000
+#define FW_MSG_CODE_RESOURCE_ALLOC_UNKNOWN      0x35000000
+#define FW_MSG_CODE_RESOURCE_ALLOC_DEPRECATED   0x36000000
 #define FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE	0xb0010000
 
 #define FW_MSG_CODE_NVM_OK			0x00010000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 6b0e22d9fe4c..1e3a16edd16d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -1691,7 +1691,7 @@ static int qed_fill_eth_dev_info(struct qed_dev *cdev,
 		}
 
 		qed_vf_get_num_vlan_filters(&cdev->hwfns[0],
-					    &info->num_vlan_filters);
+					    (u8 *)&info->num_vlan_filters);
 		qed_vf_get_port_mac(&cdev->hwfns[0], info->port_mac);
 
 		info->is_legacy = !!cdev->hwfns[0].vf_iov_info->b_pre_fp_hsi;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 092748832caf..d8e499ebb99d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -1683,3 +1683,45 @@ int qed_mcp_bist_nvm_test_get_image_att(struct qed_hwfn *p_hwfn,
 
 	return rc;
 }
+
+#define QED_RESC_ALLOC_VERSION_MAJOR    1
+#define QED_RESC_ALLOC_VERSION_MINOR    0
+#define QED_RESC_ALLOC_VERSION				     \
+	((QED_RESC_ALLOC_VERSION_MAJOR <<		     \
+	  DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MAJOR_SHIFT) | \
+	 (QED_RESC_ALLOC_VERSION_MINOR <<		     \
+	  DRV_MB_PARAM_RESOURCE_ALLOC_VERSION_MINOR_SHIFT))
+int qed_mcp_get_resc_info(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt,
+			  struct resource_info *p_resc_info,
+			  u32 *p_mcp_resp, u32 *p_mcp_param)
+{
+	struct qed_mcp_mb_params mb_params;
+	union drv_union_data *p_union_data;
+	int rc;
+
+	memset(&mb_params, 0, sizeof(mb_params));
+	mb_params.cmd = DRV_MSG_GET_RESOURCE_ALLOC_MSG;
+	mb_params.param = QED_RESC_ALLOC_VERSION;
+	p_union_data = (union drv_union_data *)p_resc_info;
+	mb_params.p_data_src = p_union_data;
+	mb_params.p_data_dst = p_union_data;
+	rc = qed_mcp_cmd_and_union(p_hwfn, p_ptt, &mb_params);
+	if (rc)
+		return rc;
+
+	*p_mcp_resp = mb_params.mcp_resp;
+	*p_mcp_param = mb_params.mcp_param;
+
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_SP,
+		   "MFW resource_info: version 0x%x, res_id 0x%x, size 0x%x, offset 0x%x, vf_size 0x%x, vf_offset 0x%x, flags 0x%x\n",
+		   *p_mcp_param,
+		   p_resc_info->res_id,
+		   p_resc_info->size,
+		   p_resc_info->offset,
+		   p_resc_info->vf_size,
+		   p_resc_info->vf_offset, p_resc_info->flags);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index be8152d49de2..407a2c1830fb 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -689,4 +689,19 @@ int qed_mcp_ov_update_eswitch(struct qed_hwfn *p_hwfn,
 			      struct qed_ptt *p_ptt,
 			      enum qed_ov_eswitch eswitch);
 
+/**
+ * @brief - Gets the MFW allocation info for the given resource
+ *
+ *  @param p_hwfn
+ *  @param p_ptt
+ *  @param p_resc_info - descriptor of requested resource
+ *  @param p_mcp_resp
+ *  @param p_mcp_param
+ *
+ * @return int - 0 - operation was successful.
+ */
+int qed_mcp_get_resc_info(struct qed_hwfn *p_hwfn,
+			  struct qed_ptt *p_ptt,
+			  struct resource_info *p_resc_info,
+			  u32 *p_mcp_resp, u32 *p_mcp_param);
 #endif
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 15130805d792..9755a3feb52e 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -22,7 +22,7 @@ struct qed_dev_eth_info {
 	u8	num_tc;
 
 	u8	port_mac[ETH_ALEN];
-	u8	num_vlan_filters;
+	u16	num_vlan_filters;
 	u16	num_mac_filters;
 
 	/* Legacy VF - this affects the datapath, so qede has to know */
-- 
cgit v1.2.3


From 556d299fcb4af8f2e8eacf311c4eee352c746788 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:02 +0100
Subject: net: pim: add common pimhdr struct and helpers

Add the common pimhdr structure and helpers to access it; also clean up
the formatting of the header file.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index e1d756f81348..354235a2691b 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_PIM_H
 #define __LINUX_PIM_H
 
+#include <linux/skbuff.h>
 #include <asm/byteorder.h>
 
 /* Message types - V1 */
@@ -13,20 +14,47 @@
 
 #define PIM_NULL_REGISTER	cpu_to_be32(0x40000000)
 
-static inline bool ipmr_pimsm_enabled(void)
-{
-	return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2);
-}
+/* RFC7761, sec 4.9:
+ * The PIM header common to all PIM messages is:
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |PIM Ver| Type  |   Reserved    |           Checksum            |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct pimhdr {
+	__u8	type;
+	__u8	reserved;
+	__be16	csum;
+};
 
 /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */
-struct pimreghdr
-{
+struct pimreghdr {
 	__u8	type;
 	__u8	reserved;
 	__be16	csum;
 	__be32	flags;
 };
 
-struct sk_buff;
-extern int pim_rcv_v1(struct sk_buff *);
+int pim_rcv_v1(struct sk_buff *skb);
+
+static inline bool ipmr_pimsm_enabled(void)
+{
+	return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2);
+}
+
+static inline struct pimhdr *pim_hdr(const struct sk_buff *skb)
+{
+	return (struct pimhdr *)skb_transport_header(skb);
+}
+
+static inline u8 pim_hdr_version(const struct pimhdr *pimhdr)
+{
+	return pimhdr->type >> 4;
+}
+
+static inline u8 pim_hdr_type(const struct pimhdr *pimhdr)
+{
+	return pimhdr->type & 0xf;
+}
 #endif
-- 
cgit v1.2.3


From 20bb6ce9879e19eee7539329eaa2408d12b00306 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:03 +0100
Subject: net: pim: add a helper to check for IPv4 all pim routers address

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index 354235a2691b..1b6c0dbba94e 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -57,4 +57,10 @@ static inline u8 pim_hdr_type(const struct pimhdr *pimhdr)
 {
 	return pimhdr->type & 0xf;
 }
+
+/* check if the address is 224.0.0.13, RFC7761 sec 4.3.1 */
+static inline bool pim_ipv4_all_pim_routers(__be32 addr)
+{
+	return addr == htonl(0xE000000D);
+}
 #endif
-- 
cgit v1.2.3


From 56245cae19f5ccb371fa63b09bb6b9ce7c0f1266 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:04 +0100
Subject: net: pim: add all RFC7761 message types

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 31 ++++++++++++++++++++++++++++++-
 net/ipv4/ipmr.c     |  2 +-
 net/ipv6/ip6mr.c    |  2 +-
 3 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index 1b6c0dbba94e..0e81b2778ae0 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -10,7 +10,36 @@
 
 /* Message types - V2 */
 #define PIM_VERSION		2
-#define PIM_REGISTER		1
+
+/* RFC7761, sec 4.9:
+ *  Type
+ *        Types for specific PIM messages.  PIM Types are:
+ *
+ *  Message Type                          Destination
+ *  ---------------------------------------------------------------------
+ *  0 = Hello                             Multicast to ALL-PIM-ROUTERS
+ *  1 = Register                          Unicast to RP
+ *  2 = Register-Stop                     Unicast to source of Register
+ *                                        packet
+ *  3 = Join/Prune                        Multicast to ALL-PIM-ROUTERS
+ *  4 = Bootstrap                         Multicast to ALL-PIM-ROUTERS
+ *  5 = Assert                            Multicast to ALL-PIM-ROUTERS
+ *  6 = Graft (used in PIM-DM only)       Unicast to RPF'(S)
+ *  7 = Graft-Ack (used in PIM-DM only)   Unicast to source of Graft
+ *                                        packet
+ *  8 = Candidate-RP-Advertisement        Unicast to Domain's BSR
+ */
+enum {
+	PIM_TYPE_HELLO,
+	PIM_TYPE_REGISTER,
+	PIM_TYPE_REGISTER_STOP,
+	PIM_TYPE_JOIN_PRUNE,
+	PIM_TYPE_BOOTSTRAP,
+	PIM_TYPE_ASSERT,
+	PIM_TYPE_GRAFT,
+	PIM_TYPE_GRAFT_ACK,
+	PIM_TYPE_CANDIDATE_RP_ADV
+};
 
 #define PIM_NULL_REGISTER	cpu_to_be32(0x40000000)
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5f006e13de56..51d71a70fbbe 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2053,7 +2053,7 @@ static int pim_rcv(struct sk_buff *skb)
 		goto drop;
 
 	pim = (struct pimreghdr *)skb_transport_header(skb);
-	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
 	    (pim->flags & PIM_NULL_REGISTER) ||
 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7f4265b1649b..52101b37ad6e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -636,7 +636,7 @@ static int pim6_rcv(struct sk_buff *skb)
 		goto drop;
 
 	pim = (struct pimreghdr *)skb_transport_header(skb);
-	if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) ||
+	if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
 	    (pim->flags & PIM_NULL_REGISTER) ||
 	    (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
 			     sizeof(*pim), IPPROTO_PIM,
-- 
cgit v1.2.3


From bc8ee596afe8f35b379f87575c46d800dd8e7e68 Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Tue, 1 Nov 2016 16:32:25 +0100
Subject: net: mii: add generic function to support ksetting support

The old ethtool api (get_settings and set_settings) has generic mii
functions mii_ethtool_sset and mii_ethtool_gset.

To support the new ethtool api ({get|set}_link_ksettings), we add
two generic mii functions, mii_ethtool_{get|set}_link_ksettings.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mii.c   | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mii.h |   4 ++
 2 files changed, 199 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/mii.c b/drivers/net/mii.c
index 993570b1e2ae..0443546fc427 100644
--- a/drivers/net/mii.c
+++ b/drivers/net/mii.c
@@ -134,6 +134,101 @@ int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd)
 	return 0;
 }
 
+/**
+ * mii_ethtool_get_link_ksettings - get settings that are specified in @cmd
+ * @mii: MII interface
+ * @cmd: requested ethtool_link_ksettings
+ *
+ * The @cmd parameter is expected to have been cleared before calling
+ * mii_ethtool_get_link_ksettings().
+ *
+ * Returns 0 for success, negative on error.
+ */
+int mii_ethtool_get_link_ksettings(struct mii_if_info *mii,
+				   struct ethtool_link_ksettings *cmd)
+{
+	struct net_device *dev = mii->dev;
+	u16 bmcr, bmsr, ctrl1000 = 0, stat1000 = 0;
+	u32 nego, supported, advertising, lp_advertising = 0;
+
+	supported = (SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full |
+		     SUPPORTED_100baseT_Half | SUPPORTED_100baseT_Full |
+		     SUPPORTED_Autoneg | SUPPORTED_TP | SUPPORTED_MII);
+	if (mii->supports_gmii)
+		supported |= SUPPORTED_1000baseT_Half |
+			SUPPORTED_1000baseT_Full;
+
+	/* only supports twisted-pair */
+	cmd->base.port = PORT_MII;
+
+	/* this isn't fully supported at higher layers */
+	cmd->base.phy_address = mii->phy_id;
+	cmd->base.mdio_support = ETH_MDIO_SUPPORTS_C22;
+
+	advertising = ADVERTISED_TP | ADVERTISED_MII;
+
+	bmcr = mii->mdio_read(dev, mii->phy_id, MII_BMCR);
+	bmsr = mii->mdio_read(dev, mii->phy_id, MII_BMSR);
+	if (mii->supports_gmii) {
+		ctrl1000 = mii->mdio_read(dev, mii->phy_id, MII_CTRL1000);
+		stat1000 = mii->mdio_read(dev, mii->phy_id, MII_STAT1000);
+	}
+	if (bmcr & BMCR_ANENABLE) {
+		advertising |= ADVERTISED_Autoneg;
+		cmd->base.autoneg = AUTONEG_ENABLE;
+
+		advertising |= mii_get_an(mii, MII_ADVERTISE);
+		if (mii->supports_gmii)
+			advertising |= mii_ctrl1000_to_ethtool_adv_t(ctrl1000);
+
+		/* partner abilities are only read once autoneg has completed;
+		 * otherwise lp_advertising keeps its initial value of 0
+		 */
+		if (bmsr & BMSR_ANEGCOMPLETE) {
+			lp_advertising = mii_get_an(mii, MII_LPA) |
+				mii_stat1000_to_ethtool_lpa_t(stat1000);
+		}
+
+		nego = advertising & lp_advertising;
+
+		if (nego & (ADVERTISED_1000baseT_Full |
+			    ADVERTISED_1000baseT_Half)) {
+			cmd->base.speed = SPEED_1000;
+			cmd->base.duplex = !!(nego & ADVERTISED_1000baseT_Full);
+		} else if (nego & (ADVERTISED_100baseT_Full |
+				   ADVERTISED_100baseT_Half)) {
+			cmd->base.speed = SPEED_100;
+			cmd->base.duplex = !!(nego & ADVERTISED_100baseT_Full);
+		} else {
+			cmd->base.speed = SPEED_10;
+			cmd->base.duplex = !!(nego & ADVERTISED_10baseT_Full);
+		}
+	} else {
+		cmd->base.autoneg = AUTONEG_DISABLE;
+
+		cmd->base.speed = ((bmcr & BMCR_SPEED1000 &&
+				    (bmcr & BMCR_SPEED100) == 0) ?
+				   SPEED_1000 :
+				   ((bmcr & BMCR_SPEED100) ?
+				    SPEED_100 : SPEED_10));
+		cmd->base.duplex = (bmcr & BMCR_FULLDPLX) ?
+			DUPLEX_FULL : DUPLEX_HALF;
+	}
+
+	mii->full_duplex = cmd->base.duplex;
+
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
+						supported);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
+						advertising);
+	ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising,
+						lp_advertising);
+
+	/* ignore maxtxpkt, maxrxpkt for now */
+
+	return 0;
+}
+
 /**
  * mii_ethtool_sset - set settings that are specified in @ecmd
  * @mii: MII interface
@@ -226,6 +321,104 @@ int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd)
 	return 0;
 }
 
+/**
+ * mii_ethtool_set_link_ksettings - set settings that are specified in @cmd
+ * @mii: MII interface
+ * @cmd: requested ethtool_link_ksettings
+ *
+ * Returns 0 for success, negative on error.
+ */
+int mii_ethtool_set_link_ksettings(struct mii_if_info *mii,
+				   const struct ethtool_link_ksettings *cmd)
+{
+	struct net_device *dev = mii->dev;
+	u32 speed = cmd->base.speed;
+
+	if (speed != SPEED_10 &&
+	    speed != SPEED_100 &&
+	    speed != SPEED_1000)
+		return -EINVAL;
+	if (cmd->base.duplex != DUPLEX_HALF && cmd->base.duplex != DUPLEX_FULL)
+		return -EINVAL;
+	if (cmd->base.port != PORT_MII)
+		return -EINVAL;
+	if (cmd->base.phy_address != mii->phy_id)
+		return -EINVAL;
+	if (cmd->base.autoneg != AUTONEG_DISABLE &&
+	    cmd->base.autoneg != AUTONEG_ENABLE)
+		return -EINVAL;
+	if ((speed == SPEED_1000) && (!mii->supports_gmii))
+		return -EINVAL;
+
+	/* ignore supported, maxtxpkt, maxrxpkt */
+
+	if (cmd->base.autoneg == AUTONEG_ENABLE) {
+		u32 bmcr, advert, tmp;
+		u32 advert2 = 0, tmp2 = 0;
+		u32 advertising;
+
+		ethtool_convert_link_mode_to_legacy_u32(
+			&advertising, cmd->link_modes.advertising);
+
+		if ((advertising & (ADVERTISED_10baseT_Half |
+				    ADVERTISED_10baseT_Full |
+				    ADVERTISED_100baseT_Half |
+				    ADVERTISED_100baseT_Full |
+				    ADVERTISED_1000baseT_Half |
+				    ADVERTISED_1000baseT_Full)) == 0)
+			return -EINVAL;
+
+		/* advertise only what has been requested */
+		advert = mii->mdio_read(dev, mii->phy_id, MII_ADVERTISE);
+		tmp = advert & ~(ADVERTISE_ALL | ADVERTISE_100BASE4);
+		if (mii->supports_gmii) {
+			advert2 = mii->mdio_read(dev, mii->phy_id,
+						 MII_CTRL1000);
+			tmp2 = advert2 &
+				~(ADVERTISE_1000HALF | ADVERTISE_1000FULL);
+		}
+		tmp |= ethtool_adv_to_mii_adv_t(advertising);
+
+		if (mii->supports_gmii)
+			tmp2 |= ethtool_adv_to_mii_ctrl1000_t(advertising);
+		if (advert != tmp) {
+			mii->mdio_write(dev, mii->phy_id, MII_ADVERTISE, tmp);
+			mii->advertising = tmp;
+		}
+		if ((mii->supports_gmii) && (advert2 != tmp2))
+			mii->mdio_write(dev, mii->phy_id, MII_CTRL1000, tmp2);
+
+		/* turn on autonegotiation, and force a renegotiate */
+		bmcr = mii->mdio_read(dev, mii->phy_id, MII_BMCR);
+		bmcr |= (BMCR_ANENABLE | BMCR_ANRESTART);
+		mii->mdio_write(dev, mii->phy_id, MII_BMCR, bmcr);
+
+		mii->force_media = 0;
+	} else {
+		u32 bmcr, tmp;
+
+		/* turn off auto negotiation, set speed and duplexity */
+		bmcr = mii->mdio_read(dev, mii->phy_id, MII_BMCR);
+		tmp = bmcr & ~(BMCR_ANENABLE | BMCR_SPEED100 |
+			       BMCR_SPEED1000 | BMCR_FULLDPLX);
+		if (speed == SPEED_1000)
+			tmp |= BMCR_SPEED1000;
+		else if (speed == SPEED_100)
+			tmp |= BMCR_SPEED100;
+		if (cmd->base.duplex == DUPLEX_FULL) {
+			tmp |= BMCR_FULLDPLX;
+			mii->full_duplex = 1;
+		} else {
+			mii->full_duplex = 0;
+		}
+		if (bmcr != tmp)
+			mii->mdio_write(dev, mii->phy_id, MII_BMCR, tmp);
+
+		mii->force_media = 1;
+	}
+	return 0;
+}
+
 /**
  * mii_check_gmii_support - check if the MII supports Gb interfaces
  * @mii: the MII interface
@@ -466,7 +659,9 @@ MODULE_LICENSE("GPL");
 EXPORT_SYMBOL(mii_link_ok);
 EXPORT_SYMBOL(mii_nway_restart);
 EXPORT_SYMBOL(mii_ethtool_gset);
+EXPORT_SYMBOL(mii_ethtool_get_link_ksettings);
 EXPORT_SYMBOL(mii_ethtool_sset);
+EXPORT_SYMBOL(mii_ethtool_set_link_ksettings);
 EXPORT_SYMBOL(mii_check_link);
 EXPORT_SYMBOL(mii_check_media);
 EXPORT_SYMBOL(mii_check_gmii_support);
diff --git a/include/linux/mii.h b/include/linux/mii.h
index 47492c9631b3..1629a0c32679 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -31,7 +31,11 @@ struct mii_if_info {
 extern int mii_link_ok (struct mii_if_info *mii);
 extern int mii_nway_restart (struct mii_if_info *mii);
 extern int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
+extern int mii_ethtool_get_link_ksettings(
+	struct mii_if_info *mii, struct ethtool_link_ksettings *cmd);
 extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
+extern int mii_ethtool_set_link_ksettings(
+	struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd);
 extern int mii_check_gmii_support(struct mii_if_info *mii);
 extern void mii_check_link (struct mii_if_info *mii);
 extern unsigned int mii_check_media (struct mii_if_info *mii,
-- 
cgit v1.2.3


From 1610a73c4175e7d63985316b52ac932b65a4dc90 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:12 +0100
Subject: netfilter: kill NF_HOOK_THRESH() and state->tresh

Patch c5136b15ea36 ("netfilter: bridge: add and use br_nf_hook_thresh")
introduced br_nf_hook_thresh().

Replace NF_HOOK_THRESH() with br_nf_hook_thresh() in
br_nf_forward_finish(), so we have no more callers for this macro.

As a result, state->thresh and the explicit thresh parameter in the hook
state structure are not required anymore. We can also get rid of the
skip-hook-under-thresh loop in nf_iterate() in the core path, which is
only used by br_netfilter to search for the filter hook.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h             | 50 +++++++++--------------------------
 include/linux/netfilter_ingress.h     |  2 +-
 net/bridge/br_netfilter_hooks.c       |  8 +++---
 net/bridge/netfilter/ebtable_broute.c |  2 +-
 net/netfilter/core.c                  |  4 ---
 net/netfilter/nf_queue.c              |  2 --
 6 files changed, 19 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index abc7fdcb9eb1..e0d000f6c9bf 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -49,7 +49,6 @@ struct sock;
 
 struct nf_hook_state {
 	unsigned int hook;
-	int thresh;
 	u_int8_t pf;
 	struct net_device *in;
 	struct net_device *out;
@@ -84,7 +83,7 @@ struct nf_hook_entry {
 static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      struct nf_hook_entry *hook_entry,
 				      unsigned int hook,
-				      int thresh, u_int8_t pf,
+				      u_int8_t pf,
 				      struct net_device *indev,
 				      struct net_device *outdev,
 				      struct sock *sk,
@@ -92,7 +91,6 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	p->hook = hook;
-	p->thresh = thresh;
 	p->pf = pf;
 	p->in = indev;
 	p->out = outdev;
@@ -155,20 +153,16 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
 
 /**
- *	nf_hook_thresh - call a netfilter hook
+ *	nf_hook - call a netfilter hook
  *
  *	Returns 1 if the hook has allowed the packet to pass.  The function
  *	okfn must be invoked by the caller in this case.  Any other return
  *	value indicates the packet has been consumed by the hook.
  */
-static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
-				 struct net *net,
-				 struct sock *sk,
-				 struct sk_buff *skb,
-				 struct net_device *indev,
-				 struct net_device *outdev,
-				 int (*okfn)(struct net *, struct sock *, struct sk_buff *),
-				 int thresh)
+static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
+			  struct sock *sk, struct sk_buff *skb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct nf_hook_entry *hook_head;
 	int ret = 1;
@@ -185,8 +179,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 	if (hook_head) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, hook_head, hook, thresh,
-				   pf, indev, outdev, sk, net, okfn);
+		nf_hook_state_init(&state, hook_head, hook, pf, indev, outdev,
+				   sk, net, okfn);
 
 		ret = nf_hook_slow(skb, &state);
 	}
@@ -195,14 +189,6 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 	return ret;
 }
 
-static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
-			  struct sock *sk, struct sk_buff *skb,
-			  struct net_device *indev, struct net_device *outdev,
-			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
-{
-	return nf_hook_thresh(pf, hook, net, sk, skb, indev, outdev, okfn, INT_MIN);
-}
-                   
 /* Activate hook; either okfn or kfree_skb called, unless a hook
    returns NF_STOLEN (in which case, it's up to the hook to deal with
    the consequences).
@@ -220,19 +206,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
    coders :)
 */
 
-static inline int
-NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
-	       struct sk_buff *skb, struct net_device *in,
-	       struct net_device *out,
-	       int (*okfn)(struct net *, struct sock *, struct sk_buff *),
-	       int thresh)
-{
-	int ret = nf_hook_thresh(pf, hook, net, sk, skb, in, out, okfn, thresh);
-	if (ret == 1)
-		ret = okfn(net, sk, skb);
-	return ret;
-}
-
 static inline int
 NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	     struct sk_buff *skb, struct net_device *in, struct net_device *out,
@@ -242,7 +215,7 @@ NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	int ret;
 
 	if (!cond ||
-	    ((ret = nf_hook_thresh(pf, hook, net, sk, skb, in, out, okfn, INT_MIN)) == 1))
+	    ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1))
 		ret = okfn(net, sk, skb);
 	return ret;
 }
@@ -252,7 +225,10 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	struct net_device *in, struct net_device *out,
 	int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
-	return NF_HOOK_THRESH(pf, hook, net, sk, skb, in, out, okfn, INT_MIN);
+	int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+	if (ret == 1)
+		ret = okfn(net, sk, skb);
+	return ret;
 }
 
 /* Call setsockopt() */
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 33e37fb41d5d..fd44e4131710 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -26,7 +26,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	if (unlikely(!e))
 		return 0;
 
-	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN,
+	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
 	return nf_hook_slow(skb, &state);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 2fe9345c1407..d0d66faebe90 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -561,8 +561,8 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
 	}
 	nf_bridge_push_encap_header(skb);
 
-	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, net, sk, skb,
-		       in, skb->dev, br_forward_finish, 1);
+	br_nf_hook_thresh(NF_BR_FORWARD, net, sk, skb, in, skb->dev,
+			  br_forward_finish);
 	return 0;
 }
 
@@ -1016,8 +1016,8 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
 
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
-	nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1,
-			   NFPROTO_BRIDGE, indev, outdev, sk, net, okfn);
+	nf_hook_state_init(&state, elem, hook, NFPROTO_BRIDGE, indev, outdev,
+			   sk, net, okfn);
 
 	ret = nf_hook_slow(skb, &state);
 	rcu_read_unlock();
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index ec94c6f1ae88..599679e3498d 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -53,7 +53,7 @@ static int ebt_broute(struct sk_buff *skb)
 	struct nf_hook_state state;
 	int ret;
 
-	nf_hook_state_init(&state, NULL, NF_BR_BROUTING, INT_MIN,
+	nf_hook_state_init(&state, NULL, NF_BR_BROUTING,
 			   NFPROTO_BRIDGE, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 76014ad72ec5..cb0232c11bc8 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -309,10 +309,6 @@ unsigned int nf_iterate(struct sk_buff *skb,
 	unsigned int verdict;
 
 	while (*entryp) {
-		if (state->thresh > (*entryp)->ops.priority) {
-			*entryp = rcu_dereference((*entryp)->next);
-			continue;
-		}
 repeat:
 		verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
 		if (verdict != NF_ACCEPT) {
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 8f08d759844a..0fb38966e5bf 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -200,8 +200,6 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 			verdict = NF_DROP;
 	}
 
-	entry->state.thresh = INT_MIN;
-
 	if (verdict == NF_ACCEPT) {
 		hook_entry = rcu_dereference(hook_entry->next);
 		if (hook_entry)
-- 
cgit v1.2.3


From 613dbd95723aee7abd16860745691b6c7bda20dc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:21 +0100
Subject: netfilter: x_tables: move hook state into xt_action_param structure

Place pointer to hook state in xt_action_param structure instead of
copying the fields that we need. After this change xt_action_param fits
into one cacheline.

This patch also adds a set of new wrapper functions to fetch relevant
hook state structure fields.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h         | 48 +++++++++++++++++++++++-------
 include/net/netfilter/nf_tables.h          | 11 +++----
 net/bridge/netfilter/ebt_arpreply.c        |  3 +-
 net/bridge/netfilter/ebt_log.c             | 11 +++----
 net/bridge/netfilter/ebt_nflog.c           |  6 ++--
 net/bridge/netfilter/ebt_redirect.c        |  6 ++--
 net/bridge/netfilter/ebtables.c            |  6 +---
 net/ipv4/netfilter/arp_tables.c            |  6 +---
 net/ipv4/netfilter/ip_tables.c             |  6 +---
 net/ipv4/netfilter/ipt_MASQUERADE.c        |  3 +-
 net/ipv4/netfilter/ipt_REJECT.c            |  4 +--
 net/ipv4/netfilter/ipt_SYNPROXY.c          |  4 +--
 net/ipv4/netfilter/ipt_rpfilter.c          |  2 +-
 net/ipv6/netfilter/ip6_tables.c            |  6 +---
 net/ipv6/netfilter/ip6t_MASQUERADE.c       |  2 +-
 net/ipv6/netfilter/ip6t_REJECT.c           | 23 ++++++++------
 net/ipv6/netfilter/ip6t_SYNPROXY.c         |  4 +--
 net/ipv6/netfilter/ip6t_rpfilter.c         |  3 +-
 net/netfilter/ipset/ip_set_core.c          |  6 ++--
 net/netfilter/ipset/ip_set_hash_netiface.c |  2 +-
 net/netfilter/xt_AUDIT.c                   | 10 +++----
 net/netfilter/xt_LOG.c                     |  6 ++--
 net/netfilter/xt_NETMAP.c                  | 20 ++++++-------
 net/netfilter/xt_NFLOG.c                   |  6 ++--
 net/netfilter/xt_NFQUEUE.c                 |  4 +--
 net/netfilter/xt_REDIRECT.c                |  4 +--
 net/netfilter/xt_TCPMSS.c                  |  4 +--
 net/netfilter/xt_TEE.c                     |  4 +--
 net/netfilter/xt_TPROXY.c                  | 16 +++++-----
 net/netfilter/xt_addrtype.c                | 10 +++----
 net/netfilter/xt_cluster.c                 |  2 +-
 net/netfilter/xt_connlimit.c               |  8 ++---
 net/netfilter/xt_conntrack.c               |  8 ++---
 net/netfilter/xt_devgroup.c                |  4 +--
 net/netfilter/xt_dscp.c                    |  2 +-
 net/netfilter/xt_ipvs.c                    |  4 +--
 net/netfilter/xt_nfacct.c                  |  2 +-
 net/netfilter/xt_osf.c                     | 10 +++----
 net/netfilter/xt_owner.c                   |  2 +-
 net/netfilter/xt_pkttype.c                 |  4 +--
 net/netfilter/xt_policy.c                  |  4 +--
 net/netfilter/xt_recent.c                  | 10 +++----
 net/netfilter/xt_set.c                     | 26 ++++++++--------
 net/netfilter/xt_socket.c                  |  4 +--
 net/sched/act_ipt.c                        | 12 ++++----
 net/sched/em_ipset.c                       | 17 ++++++-----
 46 files changed, 196 insertions(+), 169 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 2ad1a2b289b5..cd4eaf8df445 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -4,6 +4,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/static_key.h>
+#include <linux/netfilter.h>
 #include <uapi/linux/netfilter/x_tables.h>
 
 /* Test a struct->invflags and a boolean for inequality */
@@ -17,14 +18,9 @@
  * @target:	the target extension
  * @matchinfo:	per-match data
  * @targetinfo:	per-target data
- * @net		network namespace through which the action was invoked
- * @in:		input netdevice
- * @out:	output netdevice
+ * @state:	pointer to hook state this packet came from
  * @fragoff:	packet is a fragment, this is the data offset
  * @thoff:	position of transport header relative to skb->data
- * @hook:	hook number given packet came from
- * @family:	Actual NFPROTO_* through which the function is invoked
- * 		(helpful when match->family == NFPROTO_UNSPEC)
  *
  * Fields written to by extensions:
  *
@@ -38,15 +34,47 @@ struct xt_action_param {
 	union {
 		const void *matchinfo, *targinfo;
 	};
-	struct net *net;
-	const struct net_device *in, *out;
+	const struct nf_hook_state *state;
 	int fragoff;
 	unsigned int thoff;
-	unsigned int hooknum;
-	u_int8_t family;
 	bool hotdrop;
 };
 
+static inline struct net *xt_net(const struct xt_action_param *par)
+{
+	return par->state->net;
+}
+
+static inline struct net_device *xt_in(const struct xt_action_param *par)
+{
+	return par->state->in;
+}
+
+static inline const char *xt_inname(const struct xt_action_param *par)
+{
+	return par->state->in->name;
+}
+
+static inline struct net_device *xt_out(const struct xt_action_param *par)
+{
+	return par->state->out;
+}
+
+static inline const char *xt_outname(const struct xt_action_param *par)
+{
+	return par->state->out->name;
+}
+
+static inline unsigned int xt_hooknum(const struct xt_action_param *par)
+{
+	return par->state->hook;
+}
+
+static inline u_int8_t xt_family(const struct xt_action_param *par)
+{
+	return par->state->pf;
+}
+
 /**
  * struct xt_mtchk_param - parameters for match extensions'
  * checkentry functions
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5031e072567b..44060344f958 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -30,11 +30,12 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 				   const struct nf_hook_state *state)
 {
 	pkt->skb = skb;
-	pkt->net = pkt->xt.net = state->net;
-	pkt->in = pkt->xt.in = state->in;
-	pkt->out = pkt->xt.out = state->out;
-	pkt->hook = pkt->xt.hooknum = state->hook;
-	pkt->pf = pkt->xt.family = state->pf;
+	pkt->net = state->net;
+	pkt->in = state->in;
+	pkt->out = state->out;
+	pkt->hook = state->hook;
+	pkt->pf = state->pf;
+	pkt->xt.state = state;
 }
 
 static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 070cf134a22f..5929309beaa1 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -51,7 +51,8 @@ ebt_arpreply_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (diptr == NULL)
 		return EBT_DROP;
 
-	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)par->in,
+	arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr,
+		 (struct net_device *)xt_in(par),
 		 *diptr, shp, info->mac, shp);
 
 	return info->target;
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 9a11086ba6ff..e88bd4827ac1 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -179,7 +179,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_log_info *info = par->targinfo;
 	struct nf_loginfo li;
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = info->loglevel;
@@ -190,11 +190,12 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	 * nf_log_packet() with NFT_LOG_TYPE_LOG here. --Pablo
 	 */
 	if (info->bitmask & EBT_LOG_NFLOG)
-		nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
-			      par->in, par->out, &li, "%s", info->prefix);
+		nf_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+			      xt_in(par), xt_out(par), &li, "%s",
+			      info->prefix);
 	else
-		ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
-			       par->out, &li, info->prefix);
+		ebt_log_packet(net, NFPROTO_BRIDGE, xt_hooknum(par), skb,
+			       xt_in(par), xt_out(par), &li, info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index 54816150608e..c1dc48686200 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -23,16 +23,16 @@ static unsigned int
 ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_nflog_info *info = par->targinfo;
+	struct net *net = xt_net(par);
 	struct nf_loginfo li;
-	struct net *net = par->net;
 
 	li.type = NF_LOG_TYPE_ULOG;
 	li.u.ulog.copy_len = info->len;
 	li.u.ulog.group = info->group;
 	li.u.ulog.qthreshold = info->threshold;
 
-	nf_log_packet(net, PF_BRIDGE, par->hooknum, skb, par->in,
-		      par->out, &li, "%s", info->prefix);
+	nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
+		      xt_out(par), &li, "%s", info->prefix);
 	return EBT_CONTINUE;
 }
 
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index 2e7c4f974340..8d2a85e0594e 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -23,12 +23,12 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
 
-	if (par->hooknum != NF_BR_BROUTING)
+	if (xt_hooknum(par) != NF_BR_BROUTING)
 		/* rcu_read_lock()ed by nf_hook_thresh */
 		ether_addr_copy(eth_hdr(skb)->h_dest,
-				br_port_get_rcu(par->in)->br->dev->dev_addr);
+				br_port_get_rcu(xt_in(par))->br->dev->dev_addr);
 	else
-		ether_addr_copy(eth_hdr(skb)->h_dest, par->in->dev_addr);
+		ether_addr_copy(eth_hdr(skb)->h_dest, xt_in(par)->dev_addr);
 	skb->pkt_type = PACKET_HOST;
 	return info->target;
 }
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f5c11bbe27db..1ab6014cf0f8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -194,12 +194,8 @@ unsigned int ebt_do_table(struct sk_buff *skb,
 	const struct ebt_table_info *private;
 	struct xt_action_param acpar;
 
-	acpar.family  = NFPROTO_BRIDGE;
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
+	acpar.state   = state;
 	acpar.hotdrop = false;
-	acpar.hooknum = hook;
 
 	read_lock_bh(&table->lock);
 	private = table->private;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b31df597fd37..e76ab23a2deb 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -217,11 +217,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	 */
 	e = get_entry(table_base, private->hook_entry[hook]);
 
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
-	acpar.hooknum = hook;
-	acpar.family  = NFPROTO_ARP;
+	acpar.state   = state;
 	acpar.hotdrop = false;
 
 	arp = arp_hdr(skb);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c00ce90adb8..de4fa03f46f3 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -261,11 +261,7 @@ ipt_do_table(struct sk_buff *skb,
 	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
 	acpar.thoff   = ip_hdrlen(skb);
 	acpar.hotdrop = false;
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
-	acpar.family  = NFPROTO_IPV4;
-	acpar.hooknum = hook;
+	acpar.state   = state;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	local_bh_disable();
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index da7f02a0b868..34cfb9b0bc0a 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -55,7 +55,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	range.min_proto = mr->range[0].min;
 	range.max_proto = mr->range[0].max;
 
-	return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
+	return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
+				      xt_out(par));
 }
 
 static struct xt_target masquerade_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 1d16c0f28df0..8bd0d7b26632 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -34,7 +34,7 @@ static unsigned int
 reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ipt_reject_info *reject = par->targinfo;
-	int hook = par->hooknum;
+	int hook = xt_hooknum(par);
 
 	switch (reject->with) {
 	case IPT_ICMP_NET_UNREACHABLE:
@@ -59,7 +59,7 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
 		break;
 	case IPT_TCP_RESET:
-		nf_send_reset(par->net, skb, hook);
+		nf_send_reset(xt_net(par), skb, hook);
 	case IPT_ICMP_ECHOREPLY:
 		/* Doesn't happen. */
 		break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index db5b87509446..361411688221 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -263,12 +263,12 @@ static unsigned int
 synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_synproxy_info *info = par->targinfo;
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	struct synproxy_net *snet = synproxy_pernet(net);
 	struct synproxy_options opts = {};
 	struct tcphdr *th, _th;
 
-	if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+	if (nf_ip_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
 		return NF_DROP;
 
 	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 78cc64eddfc1..59b49945b481 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -95,7 +95,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	flow.flowi4_tos = RT_TOS(iph->tos);
 	flow.flowi4_scope = RT_SCOPE_UNIVERSE;
 
-	return rpfilter_lookup_reverse(par->net, &flow, par->in, info->flags) ^ invert;
+	return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
 }
 
 static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 55aacea24396..7eac01d5d621 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -291,11 +291,7 @@ ip6t_do_table(struct sk_buff *skb,
 	 * rule is also a fragment-specific rule, non-fragments won't
 	 * match it. */
 	acpar.hotdrop = false;
-	acpar.net     = state->net;
-	acpar.in      = state->in;
-	acpar.out     = state->out;
-	acpar.family  = NFPROTO_IPV6;
-	acpar.hooknum = hook;
+	acpar.state   = state;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
index 7f9f45d829d2..2b1a15846f9a 100644
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -24,7 +24,7 @@
 static unsigned int
 masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	return nf_nat_masquerade_ipv6(skb, par->targinfo, par->out);
+	return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
 }
 
 static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index db29bbf41b59..fa51a205918d 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -39,35 +39,40 @@ static unsigned int
 reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ip6t_reject_info *reject = par->targinfo;
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 
 	switch (reject->with) {
 	case IP6T_ICMP6_NO_ROUTE:
-		nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_ADM_PROHIBITED:
-		nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED,
+				 xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_NOT_NEIGHBOUR:
-		nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR,
+				 xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_ADDR_UNREACH:
-		nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH,
+				 xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_PORT_UNREACH:
-		nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH,
+				 xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_ECHOREPLY:
 		/* Do nothing */
 		break;
 	case IP6T_TCP_RESET:
-		nf_send_reset6(net, skb, par->hooknum);
+		nf_send_reset6(net, skb, xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_POLICY_FAIL:
-		nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
 		break;
 	case IP6T_ICMP6_REJECT_ROUTE:
-		nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum);
+		nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE,
+				 xt_hooknum(par));
 		break;
 	}
 
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 06bed74cf5ee..99a1216287c8 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -277,12 +277,12 @@ static unsigned int
 synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_synproxy_info *info = par->targinfo;
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	struct synproxy_net *snet = synproxy_pernet(net);
 	struct synproxy_options opts = {};
 	struct tcphdr *th, _th;
 
-	if (nf_ip6_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
+	if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP))
 		return NF_DROP;
 
 	th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 1ee1b25df096..d5263dc364a9 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -93,7 +93,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (unlikely(saddrtype == IPV6_ADDR_ANY))
 		return true ^ invert; /* not routable: forward path will drop it */
 
-	return rpfilter_lookup_reverse6(par->net, skb, par->in, info->flags) ^ invert;
+	return rpfilter_lookup_reverse6(xt_net(par), skb, xt_in(par),
+					info->flags) ^ invert;
 }
 
 static int rpfilter_check(const struct xt_mtchk_param *par)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a748b0c2c981..3f1b945a24d5 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -541,7 +541,7 @@ int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(par->net, index);
+	struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
 	int ret = 0;
 
 	BUG_ON(!set);
@@ -579,7 +579,7 @@ int
 ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(par->net, index);
+	struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
 	int ret;
 
 	BUG_ON(!set);
@@ -601,7 +601,7 @@ int
 ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(par->net, index);
+	struct ip_set *set = ip_set_rcu_get(xt_net(par), index);
 	int ret = 0;
 
 	BUG_ON(!set);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f0f688db6213..aa1a776613b9 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -170,7 +170,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
 	e.ip &= ip_set_netmask(e.cidr);
 
-#define IFACE(dir)	(par->dir ? par->dir->name : "")
+#define IFACE(dir)	(par->state->dir ? par->state->dir->name : "")
 #define SRCDIR		(opt->flags & IPSET_DIM_TWO_SRC)
 
 	if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
index 4973cbddc446..19247a17e511 100644
--- a/net/netfilter/xt_AUDIT.c
+++ b/net/netfilter/xt_AUDIT.c
@@ -132,9 +132,9 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		goto errout;
 
 	audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
-			 info->type, par->hooknum, skb->len,
-			 par->in ? par->in->name : "?",
-			 par->out ? par->out->name : "?");
+			 info->type, xt_hooknum(par), skb->len,
+			 xt_in(par) ? xt_inname(par) : "?",
+			 xt_out(par) ? xt_outname(par) : "?");
 
 	if (skb->mark)
 		audit_log_format(ab, " mark=%#x", skb->mark);
@@ -144,7 +144,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 				 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 				 ntohs(eth_hdr(skb)->h_proto));
 
-		if (par->family == NFPROTO_BRIDGE) {
+		if (xt_family(par) == NFPROTO_BRIDGE) {
 			switch (eth_hdr(skb)->h_proto) {
 			case htons(ETH_P_IP):
 				audit_ip4(ab, skb);
@@ -157,7 +157,7 @@ audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		}
 	}
 
-	switch (par->family) {
+	switch (xt_family(par)) {
 	case NFPROTO_IPV4:
 		audit_ip4(ab, skb);
 		break;
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 1763ab82bcd7..c3b2017ebe41 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -32,15 +32,15 @@ static unsigned int
 log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_log_info *loginfo = par->targinfo;
+	struct net *net = xt_net(par);
 	struct nf_loginfo li;
-	struct net *net = par->net;
 
 	li.type = NF_LOG_TYPE_LOG;
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out,
-		      &li, "%s", loginfo->prefix);
+	nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+		      xt_out(par), &li, "%s", loginfo->prefix);
 	return XT_CONTINUE;
 }
 
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index b253e07cb1c5..94d0b5411192 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -33,8 +33,8 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 		netmask.ip6[i] = ~(range->min_addr.ip6[i] ^
 				   range->max_addr.ip6[i]);
 
-	if (par->hooknum == NF_INET_PRE_ROUTING ||
-	    par->hooknum == NF_INET_LOCAL_OUT)
+	if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+	    xt_hooknum(par) == NF_INET_LOCAL_OUT)
 		new_addr.in6 = ipv6_hdr(skb)->daddr;
 	else
 		new_addr.in6 = ipv6_hdr(skb)->saddr;
@@ -51,7 +51,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	newrange.min_proto	= range->min_proto;
 	newrange.max_proto	= range->max_proto;
 
-	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
 }
 
 static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -72,16 +72,16 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
 	struct nf_nat_range newrange;
 
-	NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
-		     par->hooknum == NF_INET_POST_ROUTING ||
-		     par->hooknum == NF_INET_LOCAL_OUT ||
-		     par->hooknum == NF_INET_LOCAL_IN);
+	NF_CT_ASSERT(xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+		     xt_hooknum(par) == NF_INET_POST_ROUTING ||
+		     xt_hooknum(par) == NF_INET_LOCAL_OUT ||
+		     xt_hooknum(par) == NF_INET_LOCAL_IN);
 	ct = nf_ct_get(skb, &ctinfo);
 
 	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
 
-	if (par->hooknum == NF_INET_PRE_ROUTING ||
-	    par->hooknum == NF_INET_LOCAL_OUT)
+	if (xt_hooknum(par) == NF_INET_PRE_ROUTING ||
+	    xt_hooknum(par) == NF_INET_LOCAL_OUT)
 		new_ip = ip_hdr(skb)->daddr & ~netmask;
 	else
 		new_ip = ip_hdr(skb)->saddr & ~netmask;
@@ -96,7 +96,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	newrange.max_proto   = mr->range[0].max;
 
 	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(par->hooknum));
+	return nf_nat_setup_info(ct, &newrange, HOOK2MANIP(xt_hooknum(par)));
 }
 
 static int netmap_tg4_check(const struct xt_tgchk_param *par)
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index 8668a5c18dc3..c7f8958cea4a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -25,8 +25,8 @@ static unsigned int
 nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_nflog_info *info = par->targinfo;
+	struct net *net = xt_net(par);
 	struct nf_loginfo li;
-	struct net *net = par->net;
 
 	li.type		     = NF_LOG_TYPE_ULOG;
 	li.u.ulog.copy_len   = info->len;
@@ -37,8 +37,8 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (info->flags & XT_NFLOG_F_COPY_LEN)
 		li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
 
-	nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
-			  par->out, &li, info->prefix);
+	nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
+			  xt_in(par), xt_out(par), &li, info->prefix);
 	return XT_CONTINUE;
 }
 
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 8f1779ff7e30..a360b99a958a 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -43,7 +43,7 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
 
 	if (info->queues_total > 1) {
 		queue = nfqueue_hash(skb, queue, info->queues_total,
-				     par->family, jhash_initval);
+				     xt_family(par), jhash_initval);
 	}
 	return NF_QUEUE_NR(queue);
 }
@@ -98,7 +98,7 @@ nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
 			queue = info->queuenum + cpu % info->queues_total;
 		} else {
 			queue = nfqueue_hash(skb, queue, info->queues_total,
-					     par->family, jhash_initval);
+					     xt_family(par), jhash_initval);
 		}
 	}
 
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 03f0b370e178..651dce65a30b 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -31,7 +31,7 @@
 static unsigned int
 redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	return nf_nat_redirect_ipv6(skb, par->targinfo, par->hooknum);
+	return nf_nat_redirect_ipv6(skb, par->targinfo, xt_hooknum(par));
 }
 
 static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
@@ -62,7 +62,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
 static unsigned int
 redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	return nf_nat_redirect_ipv4(skb, par->targinfo, par->hooknum);
+	return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
 }
 
 static struct xt_target redirect_tg_reg[] __read_mostly = {
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 872db2d0e2a9..27241a767f17 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -108,7 +108,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 		return -1;
 
 	if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
-		struct net *net = par->net;
+		struct net *net = xt_net(par);
 		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
 		unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu);
 
@@ -172,7 +172,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 	 * length IPv6 header of 60, ergo the default MSS value is 1220
 	 * Since no MSS was provided, we must use the default values
 	 */
-	if (par->family == NFPROTO_IPV4)
+	if (xt_family(par) == NFPROTO_IPV4)
 		newmss = min(newmss, (u16)536);
 	else
 		newmss = min(newmss, (u16)1220);
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 0471db4032c5..1c57ace75ae6 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -33,7 +33,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tee_tginfo *info = par->targinfo;
 	int oif = info->priv ? info->priv->oif : 0;
 
-	nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
+	nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);
 
 	return XT_CONTINUE;
 }
@@ -45,7 +45,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tee_tginfo *info = par->targinfo;
 	int oif = info->priv ? info->priv->oif : 0;
 
-	nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
+	nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif);
 
 	return XT_CONTINUE;
 }
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 663c4c3c9072..dbd72cc40e42 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -364,7 +364,8 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tproxy_target_info *tgi = par->targinfo;
 
-	return tproxy_tg4(par->net, skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+	return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport,
+			  tgi->mark_mask, tgi->mark_value);
 }
 
 static unsigned int
@@ -372,7 +373,8 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
 
-	return tproxy_tg4(par->net, skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+	return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport,
+			  tgi->mark_mask, tgi->mark_value);
 }
 
 #ifdef XT_TPROXY_HAVE_IPV6
@@ -442,7 +444,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
 		 * to a listener socket if there's one */
 		struct sock *sk2;
 
-		sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+		sk2 = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
 					    &iph->saddr,
 					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
 					    hp->source,
@@ -485,10 +487,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	 * addresses, this happens if the redirect already happened
 	 * and the current packet belongs to an already established
 	 * connection */
-	sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
+	sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
 				   &iph->saddr, &iph->daddr,
 				   hp->source, hp->dest,
-				   par->in, NFT_LOOKUP_ESTABLISHED);
+				   xt_in(par), NFT_LOOKUP_ESTABLISHED);
 
 	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
 	lport = tgi->lport ? tgi->lport : hp->dest;
@@ -500,10 +502,10 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	else if (!sk)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
-		sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+		sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
 					   tproto, &iph->saddr, laddr,
 					   hp->source, lport,
-					   par->in, NFT_LOOKUP_LISTENER);
+					   xt_in(par), NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && tproxy_sk_is_transparent(sk)) {
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 11d6091991a4..e329dabde35f 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -125,7 +125,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
 static bool
 addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	const struct xt_addrtype_info *info = par->matchinfo;
 	const struct iphdr *iph = ip_hdr(skb);
 	bool ret = true;
@@ -143,19 +143,19 @@ addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
 static bool
 addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	const struct xt_addrtype_info_v1 *info = par->matchinfo;
 	const struct iphdr *iph;
 	const struct net_device *dev = NULL;
 	bool ret = true;
 
 	if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN)
-		dev = par->in;
+		dev = xt_in(par);
 	else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT)
-		dev = par->out;
+		dev = xt_out(par);
 
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	if (par->family == NFPROTO_IPV6)
+	if (xt_family(par) == NFPROTO_IPV6)
 		return addrtype_mt6(net, dev, skb, info);
 #endif
 	iph = ip_hdr(skb);
diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 96fa26b20b67..9a9884a39c0e 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -112,7 +112,7 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	 * know, matches should not alter packets, but we are doing this here
 	 * because we would need to add a PKTTYPE target for this sole purpose.
 	 */
-	if (!xt_cluster_is_multicast_addr(skb, par->family) &&
+	if (!xt_cluster_is_multicast_addr(skb, xt_family(par)) &&
 	    skb->pkt_type == PACKET_MULTICAST) {
 	    	pskb->pkt_type = PACKET_HOST;
 	}
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index b6dc322593a3..bb3845339efd 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -317,7 +317,7 @@ static int count_them(struct net *net,
 static bool
 connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	const struct xt_connlimit_info *info = par->matchinfo;
 	union nf_inet_addr addr;
 	struct nf_conntrack_tuple tuple;
@@ -332,11 +332,11 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 		zone = nf_ct_zone(ct);
 	} else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
-				      par->family, net, &tuple)) {
+				      xt_family(par), net, &tuple)) {
 		goto hotdrop;
 	}
 
-	if (par->family == NFPROTO_IPV6) {
+	if (xt_family(par) == NFPROTO_IPV6) {
 		const struct ipv6hdr *iph = ipv6_hdr(skb);
 		memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
 		       &iph->daddr : &iph->saddr, sizeof(addr.ip6));
@@ -347,7 +347,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	connections = count_them(net, info->data, tuple_ptr, &addr,
-	                         &info->mask, par->family, zone);
+	                         &info->mask, xt_family(par), zone);
 	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index a3b8f697cfc5..2dea15ebc55b 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -200,22 +200,22 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 		return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGSRC)
-		if (conntrack_mt_origsrc(ct, info, par->family) ^
+		if (conntrack_mt_origsrc(ct, info, xt_family(par)) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_ORIGDST)
-		if (conntrack_mt_origdst(ct, info, par->family) ^
+		if (conntrack_mt_origdst(ct, info, xt_family(par)) ^
 		    !(info->invert_flags & XT_CONNTRACK_ORIGDST))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLSRC)
-		if (conntrack_mt_replsrc(ct, info, par->family) ^
+		if (conntrack_mt_replsrc(ct, info, xt_family(par)) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLSRC))
 			return false;
 
 	if (info->match_flags & XT_CONNTRACK_REPLDST)
-		if (conntrack_mt_repldst(ct, info, par->family) ^
+		if (conntrack_mt_repldst(ct, info, xt_family(par)) ^
 		    !(info->invert_flags & XT_CONNTRACK_REPLDST))
 			return false;
 
diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c
index d9202cdd25c9..96ebe1cdefec 100644
--- a/net/netfilter/xt_devgroup.c
+++ b/net/netfilter/xt_devgroup.c
@@ -24,12 +24,12 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_devgroup_info *info = par->matchinfo;
 
 	if (info->flags & XT_DEVGROUP_MATCH_SRC &&
-	    (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^
+	    (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^
 	     ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0)))
 		return false;
 
 	if (info->flags & XT_DEVGROUP_MATCH_DST &&
-	    (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^
+	    (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^
 	     ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0)))
 		return false;
 
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 64670fc5d0e1..236ac8008909 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -58,7 +58,7 @@ static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_tos_match_info *info = par->matchinfo;
 
-	if (par->family == NFPROTO_IPV4)
+	if (xt_family(par) == NFPROTO_IPV4)
 		return ((ip_hdr(skb)->tos & info->tos_mask) ==
 		       info->tos_value) ^ !!info->invert;
 	else
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 71a9d95e0a81..0fdc89064488 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -48,9 +48,9 @@ static bool
 ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_ipvs_mtinfo *data = par->matchinfo;
-	struct netns_ipvs *ipvs = net_ipvs(par->net);
+	struct netns_ipvs *ipvs = net_ipvs(xt_net(par));
 	/* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */
-	const u_int8_t family = par->family;
+	const u_int8_t family = xt_family(par);
 	struct ip_vs_iphdr iph;
 	struct ip_vs_protocol *pp;
 	struct ip_vs_conn *cp;
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index cf327593852a..cc0518fe598e 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -26,7 +26,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	nfnl_acct_update(skb, info->nfacct);
 
-	overquota = nfnl_acct_overquota(par->net, skb, info->nfacct);
+	overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct);
 
 	return overquota == NFACCT_UNDERQUOTA ? false : true;
 }
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 2455b69b5810..c05fefcec238 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,7 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 	unsigned char opts[MAX_IPOPTLEN];
 	const struct xt_osf_finger *kf;
 	const struct xt_osf_user_finger *f;
-	struct net *net = p->net;
+	struct net *net = xt_net(p);
 
 	if (!info)
 		return false;
@@ -326,8 +326,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 		fcount++;
 
 		if (info->flags & XT_OSF_LOG)
-			nf_log_packet(net, p->family, p->hooknum, skb,
-				      p->in, p->out, NULL,
+			nf_log_packet(net, xt_family(p), xt_hooknum(p), skb,
+				      xt_in(p), xt_out(p), NULL,
 				      "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
 				      f->genre, f->version, f->subtype,
 				      &ip->saddr, ntohs(tcp->source),
@@ -341,8 +341,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
 	rcu_read_unlock();
 
 	if (!fcount && (info->flags & XT_OSF_LOG))
-		nf_log_packet(net, p->family, p->hooknum, skb, p->in,
-			      p->out, NULL,
+		nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p),
+			      xt_out(p), NULL,
 			"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
 				&ip->saddr, ntohs(tcp->source),
 				&ip->daddr, ntohs(tcp->dest));
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index a20e731b5b6c..16477df45b3b 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -63,7 +63,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_owner_match_info *info = par->matchinfo;
 	const struct file *filp;
 	struct sock *sk = skb_to_full_sk(skb);
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 
 	if (sk == NULL || sk->sk_socket == NULL)
 		return (info->match ^ info->invert) == 0;
diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c
index 5b645cb598fc..57efb703ff18 100644
--- a/net/netfilter/xt_pkttype.c
+++ b/net/netfilter/xt_pkttype.c
@@ -30,10 +30,10 @@ pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	if (skb->pkt_type != PACKET_LOOPBACK)
 		type = skb->pkt_type;
-	else if (par->family == NFPROTO_IPV4 &&
+	else if (xt_family(par) == NFPROTO_IPV4 &&
 	    ipv4_is_multicast(ip_hdr(skb)->daddr))
 		type = PACKET_MULTICAST;
-	else if (par->family == NFPROTO_IPV6 &&
+	else if (xt_family(par) == NFPROTO_IPV6 &&
 	    ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF)
 		type = PACKET_MULTICAST;
 	else
diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c
index f23e97bb42d7..2b4ab189bba7 100644
--- a/net/netfilter/xt_policy.c
+++ b/net/netfilter/xt_policy.c
@@ -116,9 +116,9 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	int ret;
 
 	if (info->flags & XT_POLICY_MATCH_IN)
-		ret = match_policy_in(skb, info, par->family);
+		ret = match_policy_in(skb, info, xt_family(par));
 	else
-		ret = match_policy_out(skb, info, par->family);
+		ret = match_policy_out(skb, info, xt_family(par));
 
 	if (ret < 0)
 		ret = info->flags & XT_POLICY_MATCH_NONE ? true : false;
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index e3b7a09b103e..bf250000e084 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -236,7 +236,7 @@ static void recent_table_flush(struct recent_table *t)
 static bool
 recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
-	struct net *net = par->net;
+	struct net *net = xt_net(par);
 	struct recent_net *recent_net = recent_pernet(net);
 	const struct xt_recent_mtinfo_v1 *info = par->matchinfo;
 	struct recent_table *t;
@@ -245,7 +245,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	u_int8_t ttl;
 	bool ret = info->invert;
 
-	if (par->family == NFPROTO_IPV4) {
+	if (xt_family(par) == NFPROTO_IPV4) {
 		const struct iphdr *iph = ip_hdr(skb);
 
 		if (info->side == XT_RECENT_DEST)
@@ -266,7 +266,7 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	}
 
 	/* use TTL as seen before forwarding */
-	if (par->out != NULL && skb->sk == NULL)
+	if (xt_out(par) != NULL && skb->sk == NULL)
 		ttl++;
 
 	spin_lock_bh(&recent_lock);
@@ -274,12 +274,12 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	nf_inet_addr_mask(&addr, &addr_mask, &t->mask);
 
-	e = recent_entry_lookup(t, &addr_mask, par->family,
+	e = recent_entry_lookup(t, &addr_mask, xt_family(par),
 				(info->check_set & XT_RECENT_TTL) ? ttl : 0);
 	if (e == NULL) {
 		if (!(info->check_set & XT_RECENT_SET))
 			goto out;
-		e = recent_entry_init(t, &addr_mask, par->family, ttl);
+		e = recent_entry_init(t, &addr_mask, xt_family(par), ttl);
 		if (e == NULL)
 			par->hotdrop = true;
 		ret = !ret;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 5669e5b453f4..1bfede7be418 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -55,7 +55,7 @@ set_match_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v0 *info = par->matchinfo;
 
-	ADT_OPT(opt, par->family, info->match_set.u.compat.dim,
+	ADT_OPT(opt, xt_family(par), info->match_set.u.compat.dim,
 		info->match_set.u.compat.flags, 0, UINT_MAX);
 
 	return match_set(info->match_set.index, skb, par, &opt,
@@ -118,7 +118,7 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_set_info_match_v1 *info = par->matchinfo;
 
-	ADT_OPT(opt, par->family, info->match_set.dim,
+	ADT_OPT(opt, xt_family(par), info->match_set.dim,
 		info->match_set.flags, 0, UINT_MAX);
 
 	if (opt.flags & IPSET_RETURN_NOMATCH)
@@ -184,7 +184,7 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_set_info_match_v3 *info = par->matchinfo;
 	int ret;
 
-	ADT_OPT(opt, par->family, info->match_set.dim,
+	ADT_OPT(opt, xt_family(par), info->match_set.dim,
 		info->match_set.flags, info->flags, UINT_MAX);
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -231,7 +231,7 @@ set_match_v4(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct xt_set_info_match_v4 *info = par->matchinfo;
 	int ret;
 
-	ADT_OPT(opt, par->family, info->match_set.dim,
+	ADT_OPT(opt, xt_family(par), info->match_set.dim,
 		info->match_set.flags, info->flags, UINT_MAX);
 
 	if (info->packets.op != IPSET_COUNTER_NONE ||
@@ -259,9 +259,9 @@ set_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v0 *info = par->targinfo;
 
-	ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim,
+	ADT_OPT(add_opt, xt_family(par), info->add_set.u.compat.dim,
 		info->add_set.u.compat.flags, 0, UINT_MAX);
-	ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim,
+	ADT_OPT(del_opt, xt_family(par), info->del_set.u.compat.dim,
 		info->del_set.u.compat.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
@@ -332,9 +332,9 @@ set_target_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v1 *info = par->targinfo;
 
-	ADT_OPT(add_opt, par->family, info->add_set.dim,
+	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
 		info->add_set.flags, 0, UINT_MAX);
-	ADT_OPT(del_opt, par->family, info->del_set.dim,
+	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
 		info->del_set.flags, 0, UINT_MAX);
 
 	if (info->add_set.index != IPSET_INVALID_ID)
@@ -401,9 +401,9 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_set_info_target_v2 *info = par->targinfo;
 
-	ADT_OPT(add_opt, par->family, info->add_set.dim,
+	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
 		info->add_set.flags, info->flags, info->timeout);
-	ADT_OPT(del_opt, par->family, info->del_set.dim,
+	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
 		info->del_set.flags, 0, UINT_MAX);
 
 	/* Normalize to fit into jiffies */
@@ -429,11 +429,11 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_set_info_target_v3 *info = par->targinfo;
 	int ret;
 
-	ADT_OPT(add_opt, par->family, info->add_set.dim,
+	ADT_OPT(add_opt, xt_family(par), info->add_set.dim,
 		info->add_set.flags, info->flags, info->timeout);
-	ADT_OPT(del_opt, par->family, info->del_set.dim,
+	ADT_OPT(del_opt, xt_family(par), info->del_set.dim,
 		info->del_set.flags, 0, UINT_MAX);
-	ADT_OPT(map_opt, par->family, info->map_set.dim,
+	ADT_OPT(map_opt, xt_family(par), info->map_set.dim,
 		info->map_set.flags, 0, UINT_MAX);
 
 	/* Normalize to fit into jiffies */
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 018c369c9f0d..2198914707f5 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -57,7 +57,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	struct sock *sk = skb->sk;
 
 	if (!sk)
-		sk = nf_sk_lookup_slow_v4(par->net, skb, par->in);
+		sk = nf_sk_lookup_slow_v4(xt_net(par), skb, xt_in(par));
 	if (sk) {
 		bool wildcard;
 		bool transparent = true;
@@ -114,7 +114,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
 	struct sock *sk = skb->sk;
 
 	if (!sk)
-		sk = nf_sk_lookup_slow_v6(par->net, skb, par->in);
+		sk = nf_sk_lookup_slow_v6(xt_net(par), skb, xt_in(par));
 	if (sk) {
 		bool wildcard;
 		bool transparent = true;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 378c1c976058..ce7ea6c1c50d 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -213,6 +213,12 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 	int ret = 0, result = 0;
 	struct tcf_ipt *ipt = to_ipt(a);
 	struct xt_action_param par;
+	struct nf_hook_state state = {
+		.net	= dev_net(skb->dev),
+		.in	= skb->dev,
+		.hook	= ipt->tcfi_hook,
+		.pf	= NFPROTO_IPV4,
+	};
 
 	if (skb_unclone(skb, GFP_ATOMIC))
 		return TC_ACT_UNSPEC;
@@ -226,13 +232,9 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
 	 * worry later - danger - this API seems to have changed
 	 * from earlier kernels
 	 */
-	par.net	     = dev_net(skb->dev);
-	par.in       = skb->dev;
-	par.out      = NULL;
-	par.hooknum  = ipt->tcfi_hook;
+	par.state    = &state;
 	par.target   = ipt->tcfi_t->u.kernel.target;
 	par.targinfo = ipt->tcfi_t->data;
-	par.family   = NFPROTO_IPV4;
 	ret = par.target->target(skb, &par);
 
 	switch (ret) {
diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c
index c66ca9400ab4..c1b23e3060b8 100644
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -57,17 +57,20 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
 	struct xt_action_param acpar;
 	const struct xt_set_info *set = (const void *) em->data;
 	struct net_device *dev, *indev = NULL;
+	struct nf_hook_state state = {
+		.net	= em->net,
+	};
 	int ret, network_offset;
 
 	switch (tc_skb_protocol(skb)) {
 	case htons(ETH_P_IP):
-		acpar.family = NFPROTO_IPV4;
+		state.pf = NFPROTO_IPV4;
 		if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
 			return 0;
 		acpar.thoff = ip_hdrlen(skb);
 		break;
 	case htons(ETH_P_IPV6):
-		acpar.family = NFPROTO_IPV6;
+		state.pf = NFPROTO_IPV6;
 		if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
 			return 0;
 		/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
@@ -77,9 +80,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
 		return 0;
 	}
 
-	acpar.hooknum = 0;
-
-	opt.family = acpar.family;
+	opt.family = state.pf;
 	opt.dim = set->dim;
 	opt.flags = set->flags;
 	opt.cmdflags = 0;
@@ -95,9 +96,9 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
 	if (skb->skb_iif)
 		indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
 
-	acpar.net     = em->net;
-	acpar.in      = indev ? indev : dev;
-	acpar.out     = dev;
+	state.in      = indev ? indev : dev;
+	state.out     = dev;
+	acpar.state   = &state;
 
 	ret = ip_set_test(set->index, skb, &acpar, &opt);
 
-- 
cgit v1.2.3


From 01886bd91f1ba418ce669dfe97a06ca9504e482a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:35 +0100
Subject: netfilter: remove hook_entries field from nf_hook_state

This field is only useful for nf_queue, so store it in the
nf_queue_entry structure instead, away from the core path. Pass
hook_head to nf_hook_slow().

Since we always have a valid entry on the first iteration in
nf_iterate(), we can use a 'do { ... } while (entry)' loop instead.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h             | 10 ++++------
 include/linux/netfilter_ingress.h     |  4 ++--
 include/net/netfilter/nf_queue.h      |  1 +
 net/bridge/br_netfilter_hooks.c       |  4 ++--
 net/bridge/netfilter/ebtable_broute.c |  2 +-
 net/netfilter/core.c                  |  9 ++++-----
 net/netfilter/nf_queue.c              | 13 +++++--------
 net/netfilter/nfnetlink_queue.c       |  2 +-
 8 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e0d000f6c9bf..69230140215b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -54,7 +54,6 @@ struct nf_hook_state {
 	struct net_device *out;
 	struct sock *sk;
 	struct net *net;
-	struct nf_hook_entry __rcu *hook_entries;
 	int (*okfn)(struct net *, struct sock *, struct sk_buff *);
 };
 
@@ -81,7 +80,6 @@ struct nf_hook_entry {
 };
 
 static inline void nf_hook_state_init(struct nf_hook_state *p,
-				      struct nf_hook_entry *hook_entry,
 				      unsigned int hook,
 				      u_int8_t pf,
 				      struct net_device *indev,
@@ -96,7 +94,6 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 	p->out = outdev;
 	p->sk = sk;
 	p->net = net;
-	RCU_INIT_POINTER(p->hook_entries, hook_entry);
 	p->okfn = okfn;
 }
 
@@ -150,7 +147,8 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 #endif
 
-int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
+		 struct nf_hook_entry *entry);
 
 /**
  *	nf_hook - call a netfilter hook
@@ -179,10 +177,10 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 	if (hook_head) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, hook_head, hook, pf, indev, outdev,
+		nf_hook_state_init(&state, hook, pf, indev, outdev,
 				   sk, net, okfn);
 
-		ret = nf_hook_slow(skb, &state);
+		ret = nf_hook_slow(skb, &state, hook_head);
 	}
 	rcu_read_unlock();
 
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index fd44e4131710..2dc3b49b804a 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -26,10 +26,10 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	if (unlikely(!e))
 		return 0;
 
-	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS,
+	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
-	return nf_hook_slow(skb, &state);
+	return nf_hook_slow(skb, &state, e);
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 2280cfe86c56..09948d10e38e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -12,6 +12,7 @@ struct nf_queue_entry {
 	unsigned int		id;
 
 	struct nf_hook_state	state;
+	struct nf_hook_entry	*hook;
 	u16			size; /* sizeof(entry) + saved route keys */
 
 	/* extra space to store route keys */
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 7e3645fa6339..8155bd2a5138 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1018,10 +1018,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
 
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
-	nf_hook_state_init(&state, elem, hook, NFPROTO_BRIDGE, indev, outdev,
+	nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
 			   sk, net, okfn);
 
-	ret = nf_hook_slow(skb, &state);
+	ret = nf_hook_slow(skb, &state, elem);
 	rcu_read_unlock();
 	if (ret == 1)
 		ret = okfn(net, sk, skb);
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 599679e3498d..8fe36dc3aab2 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -53,7 +53,7 @@ static int ebt_broute(struct sk_buff *skb)
 	struct nf_hook_state state;
 	int ret;
 
-	nf_hook_state_init(&state, NULL, NF_BR_BROUTING,
+	nf_hook_state_init(&state, NF_BR_BROUTING,
 			   NFPROTO_BRIDGE, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 64623374bc5f..ebece48b8392 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -308,7 +308,7 @@ unsigned int nf_iterate(struct sk_buff *skb,
 {
 	unsigned int verdict;
 
-	while (*entryp) {
+	do {
 repeat:
 		verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
 		if (verdict != NF_ACCEPT) {
@@ -317,20 +317,19 @@ repeat:
 			goto repeat;
 		}
 		*entryp = rcu_dereference((*entryp)->next);
-	}
+	} while (*entryp);
 	return NF_ACCEPT;
 }
 
 
 /* Returns 1 if okfn() needs to be executed by the caller,
  * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. */
-int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state)
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
+		 struct nf_hook_entry *entry)
 {
-	struct nf_hook_entry *entry;
 	unsigned int verdict;
 	int ret;
 
-	entry = rcu_dereference(state->hook_entries);
 next_hook:
 	verdict = nf_iterate(skb, state, &entry);
 	switch (verdict & NF_VERDICT_MASK) {
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 0fb38966e5bf..2e39e38ae1c7 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -108,7 +108,7 @@ void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry)
 }
 
 static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
-		      unsigned int queuenum)
+		      struct nf_hook_entry *hook_entry, unsigned int queuenum)
 {
 	int status = -ENOENT;
 	struct nf_queue_entry *entry = NULL;
@@ -136,6 +136,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 	*entry = (struct nf_queue_entry) {
 		.skb	= skb,
 		.state	= *state,
+		.hook	= hook_entry,
 		.size	= sizeof(*entry) + afinfo->route_key_size,
 	};
 
@@ -163,8 +164,7 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 	struct nf_hook_entry *entry = *entryp;
 	int ret;
 
-	RCU_INIT_POINTER(state->hook_entries, entry);
-	ret = __nf_queue(skb, state, verdict >> NF_VERDICT_QBITS);
+	ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS);
 	if (ret < 0) {
 		if (ret == -ESRCH &&
 		    (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) {
@@ -179,15 +179,12 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
 
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
-	struct nf_hook_entry *hook_entry;
+	struct nf_hook_entry *hook_entry = entry->hook;
+	struct nf_hook_ops *elem = &hook_entry->ops;
 	struct sk_buff *skb = entry->skb;
 	const struct nf_afinfo *afinfo;
-	struct nf_hook_ops *elem;
 	int err;
 
-	hook_entry = rcu_dereference(entry->state.hook_entries);
-	elem = &hook_entry->ops;
-
 	nf_queue_entry_release_refs(entry);
 
 	/* Continue traversal iff userspace said ok... */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5379f788a372..1e33115b399f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -919,7 +919,7 @@ static struct notifier_block nfqnl_dev_notifier = {
 
 static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr)
 {
-	return rcu_access_pointer(entry->state.hook_entries) ==
+	return rcu_access_pointer(entry->hook) ==
 		(struct nf_hook_entry *)entry_ptr;
 }
 
-- 
cgit v1.2.3


From 0cc0aa614b4c24b21b2492c0a1753035ee8c6edb Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 2 Nov 2016 11:02:17 -0400
Subject: ipv6: add IPV6_RECVFRAGSIZE cmsg

When reading a datagram or raw packet that arrived fragmented, expose
the maximum fragment size if recorded to allow applications to
estimate receive path MTU.

At this point, the field is only recorded when ipv6 connection
tracking is enabled. A follow-up patch will record this field also
in the ipv6 input path.

Tested using the test for IP_RECVFRAGSIZE plus

  ip netns exec to ip addr add dev veth1 fc07::1/64
  ip netns exec from ip addr add dev veth0 fc07::2/64

  ip netns exec to ./recv_cmsg_recvfragsize -6 -u -p 6000 &
  ip netns exec from nc -q 1 -u fc07::1 6000 < payload

Both with and without enabling connection tracking

  ip6tables -A INPUT -m state --state NEW -p udp -j LOG

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 5 +++--
 include/uapi/linux/in6.h | 1 +
 net/ipv6/datagram.c      | 5 +++++
 net/ipv6/ipv6_sockglue.c | 8 ++++++++
 4 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ca1ad9ebbc92..1afb6e8d35c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -229,8 +229,9 @@ struct ipv6_pinfo {
                                 rxflow:1,
 				rxtclass:1,
 				rxpmtu:1,
-				rxorigdstaddr:1;
-				/* 2 bits hole */
+				rxorigdstaddr:1,
+				recvfragsize:1;
+				/* 1 bits hole */
 		} bits;
 		__u16		all;
 	} rxopt;
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index b39ea4f2e701..46444f8fbee4 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -283,6 +283,7 @@ struct in6_flowlabel_req {
 #define IPV6_RECVORIGDSTADDR    IPV6_ORIGDSTADDR
 #define IPV6_TRANSPARENT        75
 #define IPV6_UNICAST_IF         76
+#define IPV6_RECVFRAGSIZE	77
 
 /*
  * Multicast Routing:
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 37874e2f30ed..620c79a0130a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -715,6 +715,11 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
 			put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6);
 		}
 	}
+	if (np->rxopt.bits.recvfragsize && opt->frag_max_size) {
+		int val = opt->frag_max_size;
+
+		put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val);
+	}
 }
 
 void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 636ec56f5f50..6c126780fcf2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -868,6 +868,10 @@ pref_skip_coa:
 		np->autoflowlabel = valbool;
 		retv = 0;
 		break;
+	case IPV6_RECVFRAGSIZE:
+		np->rxopt.bits.recvfragsize = valbool;
+		retv = 0;
+		break;
 	}
 
 	release_sock(sk);
@@ -1310,6 +1314,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 		val = np->autoflowlabel;
 		break;
 
+	case IPV6_RECVFRAGSIZE:
+		val = np->rxopt.bits.recvfragsize;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
cgit v1.2.3


From 68f929ff2654bced015ccb9b5555667f46f88dfa Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Nov 2016 17:12:06 +0000
Subject: debugfs: constify argument to debugfs_real_fops()

seq_file users can only access const version of file pointer,
because the ->file member of struct seq_operations is marked
as such.  Make parameter to debugfs_real_fops() const.

CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Nicolai Stange <nicstange@gmail.com>
CC: Christian Lamparter <chunkeey@gmail.com>
CC: LKML <linux-kernel@vger.kernel.org>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/debugfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4d3f0d1aec73..bf1907d96097 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -52,7 +52,8 @@ extern struct srcu_struct debugfs_srcu;
  * Must only be called under the protection established by
  * debugfs_use_file_start().
  */
-static inline const struct file_operations *debugfs_real_fops(struct file *filp)
+static inline const struct file_operations *
+debugfs_real_fops(const struct file *filp)
 	__must_hold(&debugfs_srcu)
 {
 	/*
-- 
cgit v1.2.3


From 5b4e2900512321435a5cd7dd77f58f23f3109950 Mon Sep 17 00:00:00 2001
From: Jon Mason <jon.mason@broadcom.com>
Date: Fri, 4 Nov 2016 01:10:56 -0400
Subject: net: phy: broadcom: add bcm54xx_auxctl_read

Add a helper function to read the AUXCTL register for the BCM54xx.  This
mirrors the bcm54xx_auxctl_write function already present in the code.

Signed-off-by: Jon Mason <jon.mason@broadcom.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/broadcom.c | 10 ++++++++++
 include/linux/brcmphy.h    |  1 +
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 583ef8a2ec8d..3a64b3d8eca8 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -30,6 +30,16 @@ MODULE_DESCRIPTION("Broadcom PHY driver");
 MODULE_AUTHOR("Maciej W. Rozycki");
 MODULE_LICENSE("GPL");
 
+static int bcm54xx_auxctl_read(struct phy_device *phydev, u16 regnum)
+{
+	/* The register must be written to both the Shadow Register Select and
+	 * the Shadow Read Register Selector
+	 */
+	phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum |
+		  regnum << MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT);
+	return phy_read(phydev, MII_BCM54XX_AUX_CTL);
+}
+
 static int bcm54xx_auxctl_write(struct phy_device *phydev, u16 regnum, u16 val)
 {
 	return phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum | val);
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 60def78c4e12..0ed66914b61c 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -110,6 +110,7 @@
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
+#define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
-- 
cgit v1.2.3


From b14995ac2527b43a75c9190fbd4efd43fb1f4562 Mon Sep 17 00:00:00 2001
From: Jon Mason <jon.mason@broadcom.com>
Date: Fri, 4 Nov 2016 01:10:58 -0400
Subject: net: phy: broadcom: Add BCM54810 PHY entry

The BCM54810 PHY requires some semi-unique configuration, which is
applied in addition to the standard config.
Also, some users of the BCM54810 require the PHY lanes to be swapped.
Since there is no way to detect this, add a device tree query to see if
it is applicable.

Inspired-by: Vikas Soni <vsoni@broadcom.com>
Signed-off-by: Jon Mason <jon.mason@broadcom.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig    |  2 +-
 drivers/net/phy/broadcom.c | 58 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/brcmphy.h    |  9 +++++++
 3 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index ff31c10a3485..d3fcfd291913 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -217,7 +217,7 @@ config BROADCOM_PHY
 	select BCM_NET_PHYLIB
 	---help---
 	  Currently supports the BCM5411, BCM5421, BCM5461, BCM54616S, BCM5464,
-	  BCM5481 and BCM5482 PHYs.
+	  BCM5481, BCM54810 and BCM5482 PHYs.
 
 config CICADA_PHY
 	tristate "Cicada PHYs"
diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3a64b3d8eca8..b1e32e9be1b3 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 #include <linux/phy.h>
 #include <linux/brcmphy.h>
-
+#include <linux/of.h>
 
 #define BRCM_PHY_MODEL(phydev) \
 	((phydev)->drv->phy_id & (phydev)->drv->phy_id_mask)
@@ -45,6 +45,34 @@ static int bcm54xx_auxctl_write(struct phy_device *phydev, u16 regnum, u16 val)
 	return phy_write(phydev, MII_BCM54XX_AUX_CTL, regnum | val);
 }
 
+static int bcm54810_config(struct phy_device *phydev)
+{
+	int rc, val;
+
+	val = bcm_phy_read_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL);
+	val &= ~BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN;
+	rc = bcm_phy_write_exp(phydev, BCM54810_EXP_BROADREACH_LRE_MISC_CTL,
+			       val);
+	if (rc < 0)
+		return rc;
+
+	val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+	val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN;
+	val |= MII_BCM54XX_AUXCTL_MISC_WREN;
+	rc = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+				  val);
+	if (rc < 0)
+		return rc;
+
+	val = bcm_phy_read_shadow(phydev, BCM54810_SHD_CLK_CTL);
+	val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN;
+	rc = bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
 /* Needs SMDSP clock enabled via bcm54xx_phydsp_config() */
 static int bcm50610_a0_workaround(struct phy_device *phydev)
 {
@@ -217,6 +245,12 @@ static int bcm54xx_config_init(struct phy_device *phydev)
 	    (phydev->dev_flags & PHY_BRCM_AUTO_PWRDWN_ENABLE))
 		bcm54xx_adjust_rxrefclk(phydev);
 
+	if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) {
+		err = bcm54810_config(phydev);
+		if (err)
+			return err;
+	}
+
 	bcm54xx_phydsp_config(phydev);
 
 	return 0;
@@ -314,6 +348,7 @@ static int bcm5482_read_status(struct phy_device *phydev)
 
 static int bcm5481_config_aneg(struct phy_device *phydev)
 {
+	struct device_node *np = phydev->mdio.dev.of_node;
 	int ret;
 
 	/* Aneg firsly. */
@@ -344,6 +379,14 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
 		phy_write(phydev, 0x18, reg);
 	}
 
+	if (of_property_read_bool(np, "enet-phy-lane-swap")) {
+		/* Lane Swap - Undocumented register...magic! */
+		ret = bcm_phy_write_exp(phydev, MII_BCM54XX_EXP_SEL_ER + 0x9,
+					0x11B);
+		if (ret < 0)
+			return ret;
+	}
+
 	return ret;
 }
 
@@ -577,6 +620,18 @@ static struct phy_driver broadcom_drivers[] = {
 	.read_status	= genphy_read_status,
 	.ack_interrupt	= bcm_phy_ack_intr,
 	.config_intr	= bcm_phy_config_intr,
+}, {
+	.phy_id         = PHY_ID_BCM54810,
+	.phy_id_mask    = 0xfffffff0,
+	.name           = "Broadcom BCM54810",
+	.features       = PHY_GBIT_FEATURES |
+			  SUPPORTED_Pause | SUPPORTED_Asym_Pause,
+	.flags          = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init    = bcm54xx_config_init,
+	.config_aneg    = bcm5481_config_aneg,
+	.read_status    = genphy_read_status,
+	.ack_interrupt  = bcm_phy_ack_intr,
+	.config_intr    = bcm_phy_config_intr,
 }, {
 	.phy_id		= PHY_ID_BCM5482,
 	.phy_id_mask	= 0xfffffff0,
@@ -661,6 +716,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = {
 	{ PHY_ID_BCM54616S, 0xfffffff0 },
 	{ PHY_ID_BCM5464, 0xfffffff0 },
 	{ PHY_ID_BCM5481, 0xfffffff0 },
+	{ PHY_ID_BCM54810, 0xfffffff0 },
 	{ PHY_ID_BCM5482, 0xfffffff0 },
 	{ PHY_ID_BCM50610, 0xfffffff0 },
 	{ PHY_ID_BCM50610M, 0xfffffff0 },
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 0ed66914b61c..848dc508ef57 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -13,6 +13,7 @@
 #define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
+#define PHY_ID_BCM54810			0x03625d00
 #define PHY_ID_BCM5482			0x0143bcb0
 #define PHY_ID_BCM5411			0x00206070
 #define PHY_ID_BCM5421			0x002060e0
@@ -56,6 +57,7 @@
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
 #define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00008000
+
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
 #define PHY_BRCM_7XXX_PATCH(x)		((x) & 0xff)
@@ -111,6 +113,7 @@
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
@@ -192,6 +195,12 @@
 #define BCM5482_SSD_SGMII_SLAVE_EN	0x0002	/* Slave mode enable */
 #define BCM5482_SSD_SGMII_SLAVE_AD	0x0001	/* Slave auto-detection */
 
+/* BCM54810 Registers */
+#define BCM54810_EXP_BROADREACH_LRE_MISC_CTL	(MII_BCM54XX_EXP_SEL_ER + 0x90)
+#define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN	(1 << 0)
+#define BCM54810_SHD_CLK_CTL			0x3
+#define BCM54810_SHD_CLK_CTL_GTXCLK_EN		(1 << 9)
+
 
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
-- 
cgit v1.2.3


From 7c13f97ffde63cc792c49ec1513f3974f2f05229 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Nov 2016 11:28:59 +0100
Subject: udp: do fwd memory scheduling on dequeue

A new argument is added to __skb_recv_datagram to provide
an explicit skb destructor, invoked under the receive queue
lock.
The UDP protocol uses such argument to perform memory
reclaiming on dequeue, so that the UDP protocol does not
set skb->destructor anymore.
Instead explicit memory reclaiming is performed at close() time and
when skbs are removed from the receive queue.
The in kernel UDP protocol users now need to call a
skb_recv_udp() variant instead of skb_recv_datagram() to
properly perform memory accounting on dequeue.

Overall, this allows acquiring only once the receive queue
lock on dequeue.

Tested using pktgen with random src port, 64 bytes packet,
wire-speed on a 10G link as sender and udp_sink as the receiver,
using an l4 tuple rxhash to stress the contention, and one or more
udp_sink instances with reuseport.

nr sinks	vanilla		patched
1		440		560
3		2150		2300
6		3650		3800
9		4450		4600
12		6250		6450

v1 -> v2:
 - do rmem and allocated memory scheduling under the receive lock
 - do bulk scheduling in first_packet_length() and in udp_destruct_sock()
 - avoid the typedef for the dequeue callback

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  4 ++++
 include/net/udp.h      | 15 +++++++++++++++
 net/core/datagram.c    | 17 ++++++++++++-----
 net/ipv4/udp.c         | 42 ++++++++++++++++++++++++------------------
 net/ipv6/udp.c         |  3 +--
 net/rxrpc/input.c      |  7 +++----
 net/sunrpc/svcsock.c   |  2 +-
 net/sunrpc/xprtsock.c  |  2 +-
 net/unix/af_unix.c     |  4 ++--
 9 files changed, 63 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cc6e23eaac91..a4aeeca7e805 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3033,9 +3033,13 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
+					void (*destructor)(struct sock *sk,
+							   struct sk_buff *skb),
 					int *peeked, int *off, int *err,
 					struct sk_buff **last);
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
+				    void (*destructor)(struct sock *sk,
+						       struct sk_buff *skb),
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
diff --git a/include/net/udp.h b/include/net/udp.h
index 6134f37ba3ab..e6e4e19be387 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -248,6 +248,21 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 /* net/ipv4/udp.c */
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
+static inline struct sk_buff *
+__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *peeked,
+	       int *off, int *err)
+{
+	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				   udp_skb_destructor, peeked, off, err);
+}
+static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
+					   int noblock, int *err)
+{
+	int peeked, off = 0;
+
+	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+}
 
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index bfb973aebb5b..49816af8586b 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -165,6 +165,7 @@ done:
  *	__skb_try_recv_datagram - Receive a datagram skbuff
  *	@sk: socket
  *	@flags: MSG_ flags
+ *	@destructor: invoked under the receive lock on successful dequeue
  *	@peeked: returns non-zero if this packet has been seen before
  *	@off: an offset in bytes to peek skb from. Returns an offset
  *	      within an skb where data actually starts
@@ -197,6 +198,8 @@ done:
  *	the standard around please.
  */
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+					void (*destructor)(struct sock *sk,
+							   struct sk_buff *skb),
 					int *peeked, int *off, int *err,
 					struct sk_buff **last)
 {
@@ -241,9 +244,11 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 				}
 
 				atomic_inc(&skb->users);
-			} else
+			} else {
 				__skb_unlink(skb, queue);
-
+				if (destructor)
+					destructor(sk, skb);
+			}
 			spin_unlock_irqrestore(&queue->lock, cpu_flags);
 			*off = _off;
 			return skb;
@@ -262,6 +267,8 @@ no_packet:
 EXPORT_SYMBOL(__skb_try_recv_datagram);
 
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+				    void (*destructor)(struct sock *sk,
+						       struct sk_buff *skb),
 				    int *peeked, int *off, int *err)
 {
 	struct sk_buff *skb, *last;
@@ -270,8 +277,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
 	do {
-		skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
-					      &last);
+		skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
+					      off, err, &last);
 		if (skb)
 			return skb;
 
@@ -290,7 +297,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
 	int peeked, off = 0;
 
 	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				   &peeked, &off, err);
+				   NULL, &peeked, &off, err);
 }
 EXPORT_SYMBOL(skb_recv_datagram);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 28a0165cb848..097b70628631 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1173,26 +1173,26 @@ out:
 	return ret;
 }
 
+/* fully reclaim rmem/fwd memory allocated for skb */
 static void udp_rmem_release(struct sock *sk, int size, int partial)
 {
 	int amt;
 
 	atomic_sub(size, &sk->sk_rmem_alloc);
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
 	sk->sk_forward_alloc -= amt;
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
 
 	if (amt)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-static void udp_rmem_free(struct sk_buff *skb)
+/* Note: called with sk_receive_queue.lock held */
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-	udp_rmem_release(skb->sk, skb->truesize, 1);
+	udp_rmem_release(sk, skb->truesize, 1);
 }
+EXPORT_SYMBOL(udp_skb_destructor);
 
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
@@ -1229,9 +1229,9 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 
 	sk->sk_forward_alloc -= size;
 
-	/* the skb owner in now the udp socket */
-	skb->sk = sk;
-	skb->destructor = udp_rmem_free;
+	/* no need to setup a destructor, we will explicitly release the
+	 * forward allocated memory on dequeue
+	 */
 	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
@@ -1255,8 +1255,15 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
 static void udp_destruct_sock(struct sock *sk)
 {
 	/* reclaim completely the forward allocated memory */
-	__skb_queue_purge(&sk->sk_receive_queue);
-	udp_rmem_release(sk, 0, 0);
+	unsigned int total = 0;
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		total += skb->truesize;
+		kfree_skb(skb);
+	}
+	udp_rmem_release(sk, total, 0);
+
 	inet_sock_destruct(sk);
 }
 
@@ -1288,12 +1295,11 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
  */
 static int first_packet_length(struct sock *sk)
 {
-	struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
+	struct sk_buff_head *rcvq = &sk->sk_receive_queue;
 	struct sk_buff *skb;
+	int total = 0;
 	int res;
 
-	__skb_queue_head_init(&list_kill);
-
 	spin_lock_bh(&rcvq->lock);
 	while ((skb = skb_peek(rcvq)) != NULL &&
 		udp_lib_checksum_complete(skb)) {
@@ -1303,12 +1309,13 @@ static int first_packet_length(struct sock *sk)
 				IS_UDPLITE(sk));
 		atomic_inc(&sk->sk_drops);
 		__skb_unlink(skb, rcvq);
-		__skb_queue_tail(&list_kill, skb);
+		total += skb->truesize;
+		kfree_skb(skb);
 	}
 	res = skb ? skb->len : -1;
+	if (total)
+		udp_rmem_release(sk, total, 1);
 	spin_unlock_bh(&rcvq->lock);
-
-	__skb_queue_purge(&list_kill);
 	return res;
 }
 
@@ -1363,8 +1370,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
-	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b5a23ce8981d..5313818b7485 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -343,8 +343,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 try_again:
 	peeking = off = sk_peek_offset(sk, flags);
-	skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
-				  &peeked, &off, &err);
+	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
 
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 44fb8d893c7d..1d87b5453ef7 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1053,7 +1053,7 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
 	ASSERT(!irqs_disabled());
 
-	skb = skb_recv_datagram(udp_sk, 0, 1, &ret);
+	skb = skb_recv_udp(udp_sk, 0, 1, &ret);
 	if (!skb) {
 		if (ret == -EAGAIN)
 			return;
@@ -1075,10 +1075,9 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
 	__UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0);
 
-	/* The socket buffer we have is owned by UDP, with UDP's data all over
-	 * it, but we really want our own data there.
+	/* The UDP protocol already released all skb resources;
+	 * we are free to add our own data there.
 	 */
-	skb_orphan(skb);
 	sp = rxrpc_skb(skb);
 
 	/* dig out the RxRPC connection details */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index e2a55dc787e6..78da4aee3543 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -547,7 +547,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp)
 	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
 			     0, 0, MSG_PEEK | MSG_DONTWAIT);
 	if (err >= 0)
-		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);
+		skb = skb_recv_udp(svsk->sk_sk, 0, 1, &err);
 
 	if (skb == NULL) {
 		if (err != -EAGAIN) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1758665d609c..7178d0aa7861 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1080,7 +1080,7 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
 	if (sk == NULL)
 		goto out;
 	for (;;) {
-		skb = skb_recv_datagram(sk, 0, 1, &err);
+		skb = skb_recv_udp(sk, 0, 1, &err);
 		if (skb != NULL) {
 			xs_udp_data_read_skb(&transport->xprt, sk, skb);
 			consume_skb(skb);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 145082e2ba36..87620183910e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2113,8 +2113,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 		mutex_lock(&u->iolock);
 
 		skip = sk_peek_offset(sk, flags);
-		skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
-					      &last);
+		skb = __skb_try_recv_datagram(sk, flags, NULL, &peeked, &skip,
+					      &err, &last);
 		if (skb)
 			break;
 
-- 
cgit v1.2.3


From 67db3e4bfbc90657c7be840aad5585be46240d6f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 4 Nov 2016 11:54:32 -0700
Subject: tcp: no longer hold ehash lock while calling tcp_get_info()

We had various problems in the past in tcp_get_info() and used
specific synchronization to avoid deadlocks.

We would like to add more instrumentation points for TCP, and
avoiding grabbing the socket lock in tcp_get_info() was too costly.

Being able to lock the socket allows us to provide a consistent set
of fields.

inet_diag_dump_icsk() can make sure ehash locks are not
held any more when tcp_get_info() is called.

We can remove syncp added in commit d654976cbf85
("tcp: fix a potential deadlock in tcp_get_info()"), but we need
to use lock_sock_fast() instead of spin_lock_bh() since TCP input
path can now be run from process context.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h  |  2 --
 net/ipv4/inet_diag.c | 48 +++++++++++++++++++++++++++++++++---------------
 net/ipv4/tcp.c       | 20 +++++++++-----------
 net/ipv4/tcp_input.c |  4 ----
 4 files changed, 42 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a17ae7b85218..32a7c7e35b71 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -176,8 +176,6 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
-	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
-
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3b34024202d8..4dea33e5f295 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -861,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			 struct netlink_callback *cb,
 			 const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct net *net = sock_net(skb->sk);
-	int i, num, s_i, s_num;
 	u32 idiag_states = r->idiag_states;
-	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	int i, num, s_i, s_num;
+	struct sock *sk;
 
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -877,7 +878,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 
 		for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 			struct inet_listen_hashbucket *ilb;
-			struct sock *sk;
 
 			num = 0;
 			ilb = &hashinfo->listening_hash[i];
@@ -922,13 +922,14 @@ skip_listen_ht:
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;
 
+#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct hlist_nulls_node *node;
-		struct sock *sk;
-
-		num = 0;
+		struct sock *sk_arr[SKARR_SZ];
+		int num_arr[SKARR_SZ];
+		int idx, accum, res;
 
 		if (hlist_nulls_empty(&head->chain))
 			continue;
@@ -936,9 +937,12 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;
 
+next_chunk:
+		num = 0;
+		accum = 0;
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &head->chain) {
-			int state, res;
+			int state;
 
 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -962,21 +966,35 @@ skip_listen_ht:
 			if (!inet_diag_bc_sk(bc, sk))
 				goto next_normal;
 
-			res = sk_diag_fill(sk, skb, r,
+			sock_hold(sk);
+			num_arr[accum] = num;
+			sk_arr[accum] = sk;
+			if (++accum == SKARR_SZ)
+				break;
+next_normal:
+			++num;
+		}
+		spin_unlock_bh(lock);
+		res = 0;
+		for (idx = 0; idx < accum; idx++) {
+			if (res >= 0) {
+				res = sk_diag_fill(sk_arr[idx], skb, r,
 					   sk_user_ns(NETLINK_CB(cb->skb).sk),
 					   NETLINK_CB(cb->skb).portid,
 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					   cb->nlh, net_admin);
-			if (res < 0) {
-				spin_unlock_bh(lock);
-				goto done;
+				if (res < 0)
+					num = num_arr[idx];
 			}
-next_normal:
-			++num;
+			sock_gen_put(sk_arr[idx]);
 		}
-
-		spin_unlock_bh(lock);
+		if (res < 0)
+			break;
 		cond_resched();
+		if (accum == SKARR_SZ) {
+			s_num = num + 1;
+			goto next_chunk;
+		}
 	}
 
 done:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 117982be0cab..a7d54cbcdabb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -405,7 +405,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = TCP_MSS_DEFAULT;
-	u64_stats_init(&tp->syncp);
 
 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
@@ -2710,9 +2709,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp, intv;
-	unsigned int start;
-	int notsent_bytes;
 	u64 rate64;
+	bool slow;
 	u32 rate;
 
 	memset(info, 0, sizeof(*info));
@@ -2792,17 +2790,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	do {
-		start = u64_stats_fetch_begin_irq(&tp->syncp);
-		put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
-		put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
-	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+	slow = lock_sock_fast(sk);
+
+	put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+	put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
+	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+
+	unlock_sock_fast(sk, slow);
+
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
 
-	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
-	info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
 	info->tcpi_data_segs_in = tp->data_segs_in;
 	info->tcpi_data_segs_out = tp->data_segs_out;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f2c59c8e57ff..a70046fea0e8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3351,9 +3351,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	u32 delta = ack - tp->snd_una;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3363,9 +3361,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 	u32 delta = seq - tp->rcv_nxt;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_received += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }
 
-- 
cgit v1.2.3


From 5a3c7805c444d9d55f302a4b3930e8758be13fab Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Sat, 5 Nov 2016 14:04:52 +0100
Subject: Revert "net: stmmac: allow to split suspend/resume from init/exit
 callbacks"

Instead of adding hooks inside stmmac_platform it is better to just use
the standard PM callbacks within the specific dwmac-driver. This is only
used by the dwmac-rk driver.

This reverts commit cecbc5563a02 ("stmmac: allow to split suspend/resume
from init/exit callbacks").

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 ++------
 include/linux/stmmac.h                                | 2 --
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 0a0d6a86f397..4d544c34c1f2 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -417,9 +417,7 @@ static int stmmac_pltfr_suspend(struct device *dev)
 	struct platform_device *pdev = to_platform_device(dev);
 
 	ret = stmmac_suspend(dev);
-	if (priv->plat->suspend)
-		priv->plat->suspend(pdev, priv->plat->bsp_priv);
-	else if (priv->plat->exit)
+	if (priv->plat->exit)
 		priv->plat->exit(pdev, priv->plat->bsp_priv);
 
 	return ret;
@@ -438,9 +436,7 @@ static int stmmac_pltfr_resume(struct device *dev)
 	struct stmmac_priv *priv = netdev_priv(ndev);
 	struct platform_device *pdev = to_platform_device(dev);
 
-	if (priv->plat->resume)
-		priv->plat->resume(pdev, priv->plat->bsp_priv);
-	else if (priv->plat->init)
+	if (priv->plat->init)
 		priv->plat->init(pdev, priv->plat->bsp_priv);
 
 	return stmmac_resume(dev);
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 705840e0438f..3537fb33cc90 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -135,8 +135,6 @@ struct plat_stmmacenet_data {
 	void (*bus_setup)(void __iomem *ioaddr);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void (*suspend)(struct platform_device *pdev, void *priv);
-	void (*resume)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 	struct stmmac_axi *axi;
 	int has_gmac4;
-- 
cgit v1.2.3


From c9f1b073d0d750ccf8b30b272d1d76479f4cccbc Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:44 +0200
Subject: net/mlx5: Add creation flags when adding new flow table

When creating flow tables, allow the caller to specify creation flags.
Currently no flags are used and as such this patch doesn't add any new
functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |  6 ++---
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  7 +++---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |  7 +++++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 28 +++++++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  1 +
 include/linux/mlx5/fs.h                            | 10 ++++++--
 12 files changed, 45 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 292ae8bbeae2..9b16431e1de8 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1857,7 +1857,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 		ft = mlx5_create_auto_grouped_flow_table(ns, priority,
 							 num_entries,
 							 num_groups,
-							 0);
+							 0, 0);
 
 		if (!IS_ERR(ft)) {
 			prio->refcount = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 8ff22e83e1dd..677b23810953 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -324,7 +324,7 @@ static int arfs_create_table(struct mlx5e_priv *priv,
 	int err;
 
 	ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO,
-				       MLX5E_ARFS_TABLE_SIZE, MLX5E_ARFS_FT_LEVEL);
+				       MLX5E_ARFS_TABLE_SIZE, MLX5E_ARFS_FT_LEVEL, 0);
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
 		ft->t = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index bed544d47ba1..9617892e0f15 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -777,7 +777,7 @@ static int mlx5e_create_ttc_table(struct mlx5e_priv *priv)
 	int err;
 
 	ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO,
-				       MLX5E_TTC_TABLE_SIZE, MLX5E_TTC_FT_LEVEL);
+				       MLX5E_TTC_TABLE_SIZE, MLX5E_TTC_FT_LEVEL, 0);
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
 		ft->t = NULL;
@@ -948,7 +948,7 @@ static int mlx5e_create_l2_table(struct mlx5e_priv *priv)
 
 	ft->num_groups = 0;
 	ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO,
-				       MLX5E_L2_TABLE_SIZE, MLX5E_L2_FT_LEVEL);
+				       MLX5E_L2_TABLE_SIZE, MLX5E_L2_FT_LEVEL, 0);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
@@ -1038,7 +1038,7 @@ static int mlx5e_create_vlan_table(struct mlx5e_priv *priv)
 
 	ft->num_groups = 0;
 	ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO,
-				       MLX5E_VLAN_TABLE_SIZE, MLX5E_VLAN_FT_LEVEL);
+				       MLX5E_VLAN_TABLE_SIZE, MLX5E_VLAN_FT_LEVEL, 0);
 
 	if (IS_ERR(ft->t)) {
 		err = PTR_ERR(ft->t);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index cf52c06377f2..87bb3db7b501 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -99,7 +99,7 @@ static struct mlx5e_ethtool_table *get_flow_table(struct mlx5e_priv *priv,
 			   MLX5E_ETHTOOL_NUM_ENTRIES);
 	ft = mlx5_create_auto_grouped_flow_table(ns, prio,
 						 table_size,
-						 MLX5E_ETHTOOL_NUM_GROUPS, 0);
+						 MLX5E_ETHTOOL_NUM_GROUPS, 0, 0);
 	if (IS_ERR(ft))
 		return (void *)ft;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 165682e2d2be..cdd430330e8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -83,7 +83,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 							    MLX5E_TC_PRIO,
 							    MLX5E_TC_TABLE_NUM_ENTRIES,
 							    MLX5E_TC_TABLE_NUM_GROUPS,
-							    0);
+							    0, 0);
 		if (IS_ERR(priv->fs.tc.t)) {
 			netdev_err(priv->netdev,
 				   "Failed to create tc offload table\n");
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9ee002ecb4bb..27f21ac66639 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -361,7 +361,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports)
 	memset(flow_group_in, 0, inlen);
 
 	table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
-	fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0);
+	fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0, 0);
 	if (IS_ERR(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create FDB Table err %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 53d9d6ce008b..b18f9513e71e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -423,7 +423,8 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
 
 	fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
 						  ESW_OFFLOADS_NUM_ENTRIES,
-						  ESW_OFFLOADS_NUM_GROUPS, 0);
+						  ESW_OFFLOADS_NUM_GROUPS, 0,
+						  0);
 	if (IS_ERR(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
@@ -432,7 +433,7 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
 	esw->fdb_table.fdb = fdb;
 
 	table_size = nvports + MAX_PF_SQ + 1;
-	fdb = mlx5_create_flow_table(root_ns, FDB_SLOW_PATH, table_size, 0);
+	fdb = mlx5_create_flow_table(root_ns, FDB_SLOW_PATH, table_size, 0, 0);
 	if (IS_ERR(fdb)) {
 		err = PTR_ERR(fdb);
 		esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err);
@@ -524,7 +525,7 @@ static int esw_create_offloads_table(struct mlx5_eswitch *esw)
 		return -ENOMEM;
 	}
 
-	ft_offloads = mlx5_create_flow_table(ns, 0, dev->priv.sriov.num_vfs + 2, 0);
+	ft_offloads = mlx5_create_flow_table(ns, 0, dev->priv.sriov.num_vfs + 2, 0, 0);
 	if (IS_ERR(ft_offloads)) {
 		err = PTR_ERR(ft_offloads);
 		esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 301cec896eb6..cc97bb218e74 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -37,6 +37,7 @@
 #include "fs_core.h"
 #include "fs_cmd.h"
 #include "mlx5_core.h"
+#include "eswitch.h"
 
 int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev,
 			    struct mlx5_flow_table *ft)
@@ -61,8 +62,9 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 			       enum fs_flow_table_op_mod op_mod,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
-			       *next_ft, unsigned int *table_id)
+			       *next_ft, unsigned int *table_id, u32 flags)
 {
+	int en_encap_decap = !!(flags & MLX5_FLOW_TABLE_TUNNEL_EN);
 	u32 out[MLX5_ST_SZ_DW(create_flow_table_out)] = {0};
 	u32 in[MLX5_ST_SZ_DW(create_flow_table_in)]   = {0};
 	int err;
@@ -78,6 +80,9 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 		MLX5_SET(create_flow_table_in, in, other_vport, 1);
 	}
 
+	MLX5_SET(create_flow_table_in, in, decap_en, en_encap_decap);
+	MLX5_SET(create_flow_table_in, in, encap_en, en_encap_decap);
+
 	switch (op_mod) {
 	case FS_FT_OP_MOD_NORMAL:
 		if (next_ft) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
index 86bead1748a7..8fad80688536 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h
@@ -38,7 +38,7 @@ int mlx5_cmd_create_flow_table(struct mlx5_core_dev *dev,
 			       enum fs_flow_table_op_mod op_mod,
 			       enum fs_flow_table_type type, unsigned int level,
 			       unsigned int log_size, struct mlx5_flow_table
-			       *next_ft, unsigned int *table_id);
+			       *next_ft, unsigned int *table_id, u32 flags);
 
 int mlx5_cmd_destroy_flow_table(struct mlx5_core_dev *dev,
 				struct mlx5_flow_table *ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index e65eabf9c850..4d28c8d70482 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -505,7 +505,8 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
 
 static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_fte,
 						enum fs_flow_table_type table_type,
-						enum fs_flow_table_op_mod op_mod)
+						enum fs_flow_table_op_mod op_mod,
+						u32 flags)
 {
 	struct mlx5_flow_table *ft;
 
@@ -519,6 +520,7 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft
 	ft->type = table_type;
 	ft->vport = vport;
 	ft->max_fte = max_fte;
+	ft->flags = flags;
 	INIT_LIST_HEAD(&ft->fwd_rules);
 	mutex_init(&ft->lock);
 
@@ -777,7 +779,8 @@ static void list_add_flow_table(struct mlx5_flow_table *ft,
 static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 							enum fs_flow_table_op_mod op_mod,
 							u16 vport, int prio,
-							int max_fte, u32 level)
+							int max_fte, u32 level,
+							u32 flags)
 {
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_table *ft;
@@ -810,7 +813,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 			      vport,
 			      max_fte ? roundup_pow_of_two(max_fte) : 0,
 			      root->table_type,
-			      op_mod);
+			      op_mod, flags);
 	if (!ft) {
 		err = -ENOMEM;
 		goto unlock_root;
@@ -820,7 +823,8 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 	log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
 	next_ft = find_next_chained_ft(fs_prio);
 	err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type,
-					 ft->level, log_table_sz, next_ft, &ft->id);
+					 ft->level, log_table_sz, next_ft, &ft->id,
+					 ft->flags);
 	if (err)
 		goto free_ft;
 
@@ -845,10 +849,11 @@ unlock_root:
 
 struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 					       int prio, int max_fte,
-					       u32 level)
+					       u32 level,
+					       u32 flags)
 {
 	return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, 0, prio,
-					max_fte, level);
+					max_fte, level, flags);
 }
 
 struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
@@ -856,7 +861,7 @@ struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace
 						     u32 level, u16 vport)
 {
 	return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, vport, prio,
-					max_fte, level);
+					max_fte, level, 0);
 }
 
 struct mlx5_flow_table *mlx5_create_lag_demux_flow_table(
@@ -864,7 +869,7 @@ struct mlx5_flow_table *mlx5_create_lag_demux_flow_table(
 					       int prio, u32 level)
 {
 	return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_LAG_DEMUX, 0, prio, 0,
-					level);
+					level, 0);
 }
 EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table);
 
@@ -872,14 +877,15 @@ struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_nam
 							    int prio,
 							    int num_flow_table_entries,
 							    int max_num_groups,
-							    u32 level)
+							    u32 level,
+							    u32 flags)
 {
 	struct mlx5_flow_table *ft;
 
 	if (max_num_groups > num_flow_table_entries)
 		return ERR_PTR(-EINVAL);
 
-	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries, level);
+	ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries, level, flags);
 	if (IS_ERR(ft))
 		return ft;
 
@@ -1822,7 +1828,7 @@ static int create_anchor_flow_table(struct mlx5_flow_steering *steering)
 	ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR);
 	if (!ns)
 		return -EINVAL;
-	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL);
+	ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL, 0);
 	if (IS_ERR(ft)) {
 		mlx5_core_err(steering->dev, "Failed to create last anchor flow table");
 		return PTR_ERR(ft);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index d5150888645c..9f616ed25a89 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -117,6 +117,7 @@ struct mlx5_flow_table {
 	struct mutex			lock;
 	/* FWD rules that point on this flow table */
 	struct list_head		fwd_rules;
+	u32				flags;
 };
 
 struct mlx5_fc_cache {
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0dcd287f4bd0..ab1a5fd2e995 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -42,6 +42,10 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO	= 1 << 16,
 };
 
+enum {
+	MLX5_FLOW_TABLE_TUNNEL_EN = BIT(0),
+};
+
 #define LEFTOVERS_RULE_NUM	 2
 static inline void build_leftovers_ft_param(int *priority,
 					    int *n_ent,
@@ -97,13 +101,15 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 				    int prio,
 				    int num_flow_table_entries,
 				    int max_num_groups,
-				    u32 level);
+				    u32 level,
+				    u32 flags);
 
 struct mlx5_flow_table *
 mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
 		       int num_flow_table_entries,
-		       u32 level);
+		       u32 level,
+		       u32 flags);
 struct mlx5_flow_table *
 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
 			     int prio,
-- 
cgit v1.2.3


From 66958ed906b87816314c0517f05fe0b5766ec7fe Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:45 +0200
Subject: net/mlx5: Support encap id when setting new steering entry

In order to support steering rules which add encapsulation headers,
encap_id parameter is needed.

Add new mlx5_flow_act struct which holds action related parameter:
action, flow_tag and encap_id. Use mlx5_flow_act struct when adding a new
steering rule.
This patch doesn't change any functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/mlx5/main.c                  | 10 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c  | 17 +++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    | 29 ++++++++-----
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    | 10 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  9 ++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  | 23 ++++++-----
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 25 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 48 ++++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |  1 +
 include/linux/mlx5/fs.h                            |  9 +++-
 11 files changed, 104 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 9b16431e1de8..76ed57f1b678 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1877,10 +1877,10 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
 {
 	struct mlx5_flow_table	*ft = ft_prio->flow_table;
 	struct mlx5_ib_flow_handler *handler;
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
 	unsigned int spec_index;
-	u32 action;
 	int err = 0;
 
 	if (!is_valid_attr(flow_attr))
@@ -1905,12 +1905,12 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
 	}
 
 	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
-	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
+	flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
 		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+	flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
 	handler->rule = mlx5_add_flow_rules(ft, spec,
-					   action,
-					   MLX5_FS_DEFAULT_FLOW_TAG,
-					   dst, 1);
+					    &flow_act,
+					    dst, 1);
 
 	if (IS_ERR(handler->rule)) {
 		err = PTR_ERR(handler->rule);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 677b23810953..68419a01db36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -174,6 +174,11 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
 				 enum arfs_type type)
 {
 	struct arfs_table *arfs_t = &priv->fs.arfs.arfs_tables[type];
+	struct mlx5_flow_act flow_act = {
+		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
+		.encap_id = 0,
+	};
 	struct mlx5_flow_destination dest;
 	struct mlx5e_tir *tir = priv->indir_tir;
 	struct mlx5_flow_spec *spec;
@@ -206,8 +211,7 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
 	}
 
 	arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, spec,
-						   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-						   MLX5_FS_DEFAULT_FLOW_TAG,
+						   &flow_act,
 						   &dest, 1);
 	if (IS_ERR(arfs_t->default_rule)) {
 		err = PTR_ERR(arfs_t->default_rule);
@@ -465,6 +469,11 @@ static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs,
 static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
 					      struct arfs_rule *arfs_rule)
 {
+	struct mlx5_flow_act flow_act = {
+		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
+		.encap_id = 0,
+	};
 	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
 	struct arfs_tuple *tuple = &arfs_rule->tuple;
 	struct mlx5_flow_handle *rule = NULL;
@@ -544,9 +553,7 @@ static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
 	}
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = priv->direct_tir[arfs_rule->rxq].tirn;
-	rule = mlx5_add_flow_rules(ft, spec, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				   MLX5_FS_DEFAULT_FLOW_TAG,
-				   &dest, 1);
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule(filter id=%d, rq idx=%d) failed, err=%d\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 9617892e0f15..1fe80de5d68f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -158,6 +158,11 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 				 enum mlx5e_vlan_rule_type rule_type,
 				 u16 vid, struct mlx5_flow_spec *spec)
 {
+	struct mlx5_flow_act flow_act = {
+		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
+		.encap_id = 0,
+	};
 	struct mlx5_flow_table *ft = priv->fs.vlan.ft.t;
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_handle **rule_p;
@@ -187,10 +192,7 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	*rule_p = mlx5_add_flow_rules(ft, spec,
-				      MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				      MLX5_FS_DEFAULT_FLOW_TAG,
-				      &dest, 1);
+	*rule_p = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
 
 	if (IS_ERR(*rule_p)) {
 		err = PTR_ERR(*rule_p);
@@ -623,6 +625,11 @@ mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
 			u16 etype,
 			u8 proto)
 {
+	struct mlx5_flow_act flow_act = {
+		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
+		.encap_id = 0,
+	};
 	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_spec *spec;
 	int err = 0;
@@ -644,10 +651,7 @@ mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
 		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
 	}
 
-	rule = mlx5_add_flow_rules(ft, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				   MLX5_FS_DEFAULT_FLOW_TAG,
-				   dest, 1);
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
@@ -810,6 +814,11 @@ static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
 static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
 				  struct mlx5e_l2_rule *ai, int type)
 {
+	struct mlx5_flow_act flow_act = {
+		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
+		.encap_id = 0,
+	};
 	struct mlx5_flow_table *ft = priv->fs.l2.ft.t;
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_spec *spec;
@@ -848,9 +857,7 @@ static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	ai->rule = mlx5_add_flow_rules(ft, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       MLX5_FS_DEFAULT_FLOW_TAG, &dest, 1);
+	ai->rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
 	if (IS_ERR(ai->rule)) {
 		netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n",
 			   __func__, mv_dmac);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 87bb3db7b501..3691451c728c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -290,10 +290,10 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
 		      struct ethtool_rx_flow_spec *fs)
 {
 	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	struct mlx5_flow_handle *rule;
 	int err = 0;
-	u32 action;
 
 	spec = mlx5_vzalloc(sizeof(*spec));
 	if (!spec)
@@ -304,7 +304,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
 		goto free;
 
 	if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
-		action = MLX5_FLOW_CONTEXT_ACTION_DROP;
+		flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
 	} else {
 		dst = kzalloc(sizeof(*dst), GFP_KERNEL);
 		if (!dst) {
@@ -314,12 +314,12 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
 
 		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 		dst->tir_num = priv->direct_tir[fs->ring_cookie].tirn;
-		action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+		flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	}
 
 	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
-	rule = mlx5_add_flow_rules(ft, spec, action,
-				   MLX5_FS_DEFAULT_FLOW_TAG, dst, 1);
+	flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
+	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cdd430330e8e..35e38d12ba68 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -61,6 +61,11 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 {
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mlx5_flow_destination dest = { 0 };
+	struct mlx5_flow_act flow_act = {
+		.action = action,
+		.flow_tag = flow_tag,
+		.encap_id = 0,
+	};
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
 	bool table_created = false;
@@ -95,9 +100,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-	rule = mlx5_add_flow_rules(priv->fs.tc.t, spec,
-				   action, flow_tag,
-				   &dest, 1);
+	rule = mlx5_add_flow_rules(priv->fs.tc.t, spec, &flow_act, &dest, 1);
 
 	if (IS_ERR(rule))
 		goto err_add_rule;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 27f21ac66639..ae05d27832e4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -244,6 +244,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 	int match_header = (is_zero_ether_addr(mac_c) ? 0 :
 			    MLX5_MATCH_OUTER_HEADERS);
 	struct mlx5_flow_handle *flow_rule = NULL;
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_spec *spec;
 	void *mv_misc = NULL;
@@ -285,10 +286,10 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 		  "\tFDB add rule dmac_v(%pM) dmac_c(%pM) -> vport(%d)\n",
 		  dmac_v, dmac_c, vport);
 	spec->match_criteria_enable = match_header;
+	flow_act.action =  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	flow_rule =
 		mlx5_add_flow_rules(esw->fdb_table.fdb, spec,
-				    MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				    0, &dest, 1);
+				    &flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev,
 			 "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n",
@@ -1212,6 +1213,7 @@ static void esw_vport_disable_ingress_acl(struct mlx5_eswitch *esw,
 static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 				    struct mlx5_vport *vport)
 {
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 	u8 *smac_v;
@@ -1264,10 +1266,10 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	}
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
 	vport->ingress.allow_rule =
 		mlx5_add_flow_rules(vport->ingress.acl, spec,
-				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				    0, NULL, 0);
+				    &flow_act, NULL, 0);
 	if (IS_ERR(vport->ingress.allow_rule)) {
 		err = PTR_ERR(vport->ingress.allow_rule);
 		esw_warn(esw->dev,
@@ -1278,10 +1280,10 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 	}
 
 	memset(spec, 0, sizeof(*spec));
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
 	vport->ingress.drop_rule =
 		mlx5_add_flow_rules(vport->ingress.acl, spec,
-				    MLX5_FLOW_CONTEXT_ACTION_DROP,
-				    0, NULL, 0);
+				    &flow_act, NULL, 0);
 	if (IS_ERR(vport->ingress.drop_rule)) {
 		err = PTR_ERR(vport->ingress.drop_rule);
 		esw_warn(esw->dev,
@@ -1301,6 +1303,7 @@ out:
 static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 				   struct mlx5_vport *vport)
 {
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
@@ -1338,10 +1341,10 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	MLX5_SET(fte_match_param, spec->match_value, outer_headers.first_vid, vport->info.vlan);
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_ALLOW;
 	vport->egress.allowed_vlan =
 		mlx5_add_flow_rules(vport->egress.acl, spec,
-				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				    0, NULL, 0);
+				    &flow_act, NULL, 0);
 	if (IS_ERR(vport->egress.allowed_vlan)) {
 		err = PTR_ERR(vport->egress.allowed_vlan);
 		esw_warn(esw->dev,
@@ -1353,10 +1356,10 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 
 	/* Drop others rule (star rule) */
 	memset(spec, 0, sizeof(*spec));
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
 	vport->egress.drop_rule =
 		mlx5_add_flow_rules(vport->egress.acl, spec,
-				    MLX5_FLOW_CONTEXT_ACTION_DROP,
-				    0, NULL, 0);
+				    &flow_act, NULL, 0);
 	if (IS_ERR(vport->egress.drop_rule)) {
 		err = PTR_ERR(vport->egress.drop_rule);
 		esw_warn(esw->dev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b18f9513e71e..a390117ed34c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -49,23 +49,23 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest[2] = {};
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
 	void *misc;
-	int action;
 	int i = 0;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
 
-	action = attr->action;
+	flow_act.action = attr->action;
 
-	if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 		dest[i].vport_num = attr->out_rep->vport;
 		i++;
 	}
-	if (action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
 		counter = mlx5_fc_create(esw->dev, true);
 		if (IS_ERR(counter))
 			return ERR_CAST(counter);
@@ -84,7 +84,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				      MLX5_MATCH_MISC_PARAMETERS;
 
 	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
-				   spec, action, 0, dest, i);
+				   spec, &flow_act, dest, i);
 	if (IS_ERR(rule))
 		mlx5_fc_destroy(esw->dev, counter);
 
@@ -274,6 +274,7 @@ out:
 static struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
 {
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
@@ -297,10 +298,10 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
 	spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = vport;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
 	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
-					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-					0, &dest, 1);
+					&flow_act, &dest, 1);
 	if (IS_ERR(flow_rule))
 		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
 out:
@@ -363,6 +364,7 @@ out_err:
 
 static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 {
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_handle *flow_rule = NULL;
 	struct mlx5_flow_spec *spec;
@@ -377,10 +379,10 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = 0;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 
 	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
-					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-					0, &dest, 1);
+					&flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		err = PTR_ERR(flow_rule);
 		esw_warn(esw->dev,  "FDB: Failed to add miss flow rule err %d\n", err);
@@ -591,6 +593,7 @@ static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw)
 struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 {
+	struct mlx5_flow_act flow_act = {0};
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
@@ -613,9 +616,9 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = tirn;
 
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 	flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec,
-					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-					0, &dest, 1);
+				       &flow_act, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule));
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index cc97bb218e74..c4478ecd8056 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -248,6 +248,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	MLX5_SET(flow_context, in_flow_context, group_id, group_id);
 	MLX5_SET(flow_context, in_flow_context, flow_tag, fte->flow_tag);
 	MLX5_SET(flow_context, in_flow_context, action, fte->action);
+	MLX5_SET(flow_context, in_flow_context, encap_id, fte->encap_id);
 	in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context,
 				      match_value);
 	memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 4d28c8d70482..9adc766c7a3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -460,8 +460,7 @@ static void del_flow_group(struct fs_node *node)
 			       fg->id, ft->id);
 }
 
-static struct fs_fte *alloc_fte(u8 action,
-				u32 flow_tag,
+static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
 				u32 *match_value,
 				unsigned int index)
 {
@@ -473,9 +472,10 @@ static struct fs_fte *alloc_fte(u8 action,
 
 	memcpy(fte->val, match_value, sizeof(fte->val));
 	fte->node.type =  FS_TYPE_FLOW_ENTRY;
-	fte->flow_tag = flow_tag;
+	fte->flow_tag = flow_act->flow_tag;
 	fte->index = index;
-	fte->action = action;
+	fte->action = flow_act->action;
+	fte->encap_id = flow_act->encap_id;
 
 	return fte;
 }
@@ -1117,15 +1117,14 @@ static unsigned int get_free_fte_index(struct mlx5_flow_group *fg,
 /* prev is output, prev->next = new_fte */
 static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
 				 u32 *match_value,
-				 u8 action,
-				 u32 flow_tag,
+				 struct mlx5_flow_act *flow_act,
 				 struct list_head **prev)
 {
 	struct fs_fte *fte;
 	int index;
 
 	index = get_free_fte_index(fg, prev);
-	fte = alloc_fte(action, flow_tag, match_value, index);
+	fte = alloc_fte(flow_act, match_value, index);
 	if (IS_ERR(fte))
 		return fte;
 
@@ -1219,8 +1218,7 @@ static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte,
 
 static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 					    u32 *match_value,
-					    u8 action,
-					    u32 flow_tag,
+					    struct mlx5_flow_act *flow_act,
 					    struct mlx5_flow_destination *dest,
 					    int dest_num)
 {
@@ -1234,12 +1232,13 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	fs_for_each_fte(fte, fg) {
 		nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
 		if (compare_match_value(&fg->mask, match_value, &fte->val) &&
-		    (action & fte->action) && flow_tag == fte->flow_tag) {
+		    (flow_act->action & fte->action) &&
+		    flow_act->flow_tag == fte->flow_tag) {
 			int old_action = fte->action;
 
-			fte->action |= action;
+			fte->action |= flow_act->action;
 			handle = add_rule_fte(fte, fg, dest, dest_num,
-					      old_action != action);
+					      old_action != flow_act->action);
 			if (IS_ERR(handle)) {
 				fte->action = old_action;
 				goto unlock_fte;
@@ -1255,7 +1254,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 		goto unlock_fg;
 	}
 
-	fte = create_fte(fg, match_value, action, flow_tag, &prev);
+	fte = create_fte(fg, match_value, flow_act, &prev);
 	if (IS_ERR(fte)) {
 		handle = (void *)fte;
 		goto unlock_fg;
@@ -1332,17 +1331,17 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 static struct mlx5_flow_handle *
 _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		     struct mlx5_flow_spec *spec,
-		     u32 action,
-		     u32 flow_tag,
+		     struct mlx5_flow_act *flow_act,
 		     struct mlx5_flow_destination *dest,
 		     int dest_num)
+
 {
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
 	int i;
 
 	for (i = 0; i < dest_num; i++) {
-		if (!dest_is_valid(&dest[i], action, ft))
+		if (!dest_is_valid(&dest[i], flow_act->action, ft))
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -1353,7 +1352,7 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 					   g->mask.match_criteria,
 					   spec->match_criteria)) {
 			rule = add_rule_fg(g, spec->match_value,
-					   action, flow_tag, dest, dest_num);
+					   flow_act, dest, dest_num);
 			if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC)
 				goto unlock;
 		}
@@ -1365,8 +1364,7 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		goto unlock;
 	}
 
-	rule = add_rule_fg(g, spec->match_value,
-			   action, flow_tag, dest, dest_num);
+	rule = add_rule_fg(g, spec->match_value, flow_act, dest, dest_num);
 	if (IS_ERR(rule)) {
 		/* Remove assumes refcount > 0 and autogroup creates a group
 		 * with a refcount = 0.
@@ -1390,8 +1388,7 @@ static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		    struct mlx5_flow_spec *spec,
-		    u32 action,
-		    u32 flow_tag,
+		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int dest_num)
 {
@@ -1399,11 +1396,11 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 	struct mlx5_flow_destination gen_dest;
 	struct mlx5_flow_table *next_ft = NULL;
 	struct mlx5_flow_handle *handle = NULL;
-	u32 sw_action = action;
+	u32 sw_action = flow_act->action;
 	struct fs_prio *prio;
 
 	fs_get_obj(prio, ft->node.parent);
-	if (action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
+	if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		if (!fwd_next_prio_supported(ft))
 			return ERR_PTR(-EOPNOTSUPP);
 		if (dest)
@@ -1415,15 +1412,14 @@ mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 			gen_dest.ft = next_ft;
 			dest = &gen_dest;
 			dest_num = 1;
-			action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			flow_act->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 		} else {
 			mutex_unlock(&root->chain_lock);
 			return ERR_PTR(-EOPNOTSUPP);
 		}
 	}
 
-	handle = _mlx5_add_flow_rules(ft, spec, action, flow_tag, dest,
-				      dest_num);
+	handle = _mlx5_add_flow_rules(ft, spec, flow_act, dest, dest_num);
 
 	if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		if (!IS_ERR_OR_NULL(handle) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 9f616ed25a89..8e668c63f69e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -151,6 +151,7 @@ struct fs_fte {
 	u32				flow_tag;
 	u32				index;
 	u32				action;
+	u32				encap_id;
 	enum fs_fte_status		status;
 	struct mlx5_fc			*counter;
 };
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ab1a5fd2e995..949b24b6c479 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -130,14 +130,19 @@ struct mlx5_flow_group *
 mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in);
 void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 
+struct mlx5_flow_act {
+	u32 action;
+	u32 flow_tag;
+	u32 encap_id;
+};
+
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
  */
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		    struct mlx5_flow_spec *spec,
-		    u32 action,
-		    u32 flow_tag,
+		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int dest_num);
 void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
-- 
cgit v1.2.3


From 1ababeba4a21f3dba3da3523c670b207fb2feb62 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:39 +0100
Subject: ipv6: implement dataplane support for rthdr type 4 (Segment Routing
 Header)

Implement minimal support for processing of SR-enabled packets
as described in
https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-02.

This patch implements the following operations:
- Intermediate segment endpoint: incrementation of active segment and rerouting.
- Egress for SR-encapsulated packets: decapsulation of outer IPv6 header + SRH
  and routing of inner packet.
- Cleanup flag support for SR-inlined packets: removal of SRH if we are the
  penultimate segment endpoint.

A per-interface sysctl seg6_enabled is provided, to accept/deny SR-enabled
packets. Default is deny.

This patch does not provide support for HMAC-signed packets.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |   1 +
 include/linux/seg6.h      |   6 ++
 include/net/seg6.h        |  36 ++++++++++
 include/uapi/linux/ipv6.h |   2 +
 include/uapi/linux/seg6.h |  54 ++++++++++++++
 net/ipv6/addrconf.c       |  10 +++
 net/ipv6/exthdrs.c        | 175 ++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 284 insertions(+)
 create mode 100644 include/linux/seg6.h
 create mode 100644 include/net/seg6.h
 create mode 100644 include/uapi/linux/seg6.h

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1afb6e8d35c3..68d3f71f0abf 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -64,6 +64,7 @@ struct ipv6_devconf {
 	} stable_secret;
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
+	__s32		seg6_enabled;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/linux/seg6.h b/include/linux/seg6.h
new file mode 100644
index 000000000000..7a66d2b4c5a6
--- /dev/null
+++ b/include/linux/seg6.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_H
+#define _LINUX_SEG6_H
+
+#include <uapi/linux/seg6.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
new file mode 100644
index 000000000000..4dd52a7e95f1
--- /dev/null
+++ b/include/net/seg6.h
@@ -0,0 +1,36 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_H
+#define _NET_SEG6_H
+
+static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
+				     __be32 to)
+{
+	__be32 diff[] = { ~from, to };
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
+				      __be32 *to)
+{
+	__be32 diff[] = {
+		~from[0], ~from[1], ~from[2], ~from[3],
+		to[0], to[1], to[2], to[3],
+	};
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+#endif
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 8c2772340c3f..7ff1d654e333 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -39,6 +39,7 @@ struct in6_ifreq {
 #define IPV6_SRCRT_STRICT	0x01	/* Deprecated; will be removed */
 #define IPV6_SRCRT_TYPE_0	0	/* Deprecated; will be removed */
 #define IPV6_SRCRT_TYPE_2	2	/* IPv6 type 2 Routing Header	*/
+#define IPV6_SRCRT_TYPE_4	4	/* Segment Routing with IPv6 */
 
 /*
  *	routing header
@@ -178,6 +179,7 @@ enum {
 	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+	DEVCONF_SEG6_ENABLED,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h
new file mode 100644
index 000000000000..c396a8052f73
--- /dev/null
+++ b/include/uapi/linux/seg6.h
@@ -0,0 +1,54 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_H
+#define _UAPI_LINUX_SEG6_H
+
+/*
+ * SRH
+ */
+struct ipv6_sr_hdr {
+	__u8	nexthdr;
+	__u8	hdrlen;
+	__u8	type;
+	__u8	segments_left;
+	__u8	first_segment;
+	__u8	flag_1;
+	__u8	flag_2;
+	__u8	reserved;
+
+	struct in6_addr segments[0];
+};
+
+#define SR6_FLAG1_CLEANUP	(1 << 7)
+#define SR6_FLAG1_PROTECTED	(1 << 6)
+#define SR6_FLAG1_OAM		(1 << 5)
+#define SR6_FLAG1_ALERT		(1 << 4)
+#define SR6_FLAG1_HMAC		(1 << 3)
+
+#define SR6_TLV_INGRESS		1
+#define SR6_TLV_EGRESS		2
+#define SR6_TLV_OPAQUE		3
+#define SR6_TLV_PADDING		4
+#define SR6_TLV_HMAC		5
+
+#define sr_has_cleanup(srh) ((srh)->flag_1 & SR6_FLAG1_CLEANUP)
+#define sr_has_hmac(srh) ((srh)->flag_1 & SR6_FLAG1_HMAC)
+
+struct sr6_tlv {
+	__u8 type;
+	__u8 len;
+	__u8 data[0];
+};
+
+#endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 060dd9922018..2ac6cb460af0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -238,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
+	.seg6_enabled		= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -284,6 +285,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
+	.seg6_enabled		= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -4944,6 +4946,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
+	array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6035,6 +6038,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.proc_handler	= proc_dointvec,
 
 	},
+	{
+		.procname	= "seg6_enabled",
+		.data		= &ipv6_devconf.seg6_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		/* sentinel */
 	}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 139ceb68bd37..b8ba3961ff8a 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -47,6 +47,8 @@
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 #include <net/xfrm.h>
 #endif
+#include <linux/seg6.h>
+#include <net/seg6.h>
 
 #include <linux/uaccess.h>
 
@@ -286,6 +288,175 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
 	return -1;
 }
 
+static void seg6_update_csum(struct sk_buff *skb)
+{
+	struct ipv6_sr_hdr *hdr;
+	struct in6_addr *addr;
+	__be32 from, to;
+
+	/* srh is at transport offset and seg_left is already decremented
+	 * but daddr is not yet updated with next segment
+	 */
+
+	hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+	addr = hdr->segments + hdr->segments_left;
+
+	hdr->segments_left++;
+	from = *(__be32 *)hdr;
+
+	hdr->segments_left--;
+	to = *(__be32 *)hdr;
+
+	/* update skb csum with diff resulting from seg_left decrement */
+
+	update_csum_diff4(skb, from, to);
+
+	/* compute csum diff between current and next segment and update */
+
+	update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr),
+			   (__be32 *)addr);
+}
+
+static int ipv6_srh_rcv(struct sk_buff *skb)
+{
+	struct inet6_skb_parm *opt = IP6CB(skb);
+	struct net *net = dev_net(skb->dev);
+	struct ipv6_sr_hdr *hdr;
+	struct inet6_dev *idev;
+	struct in6_addr *addr;
+	bool cleanup = false;
+	int accept_seg6;
+
+	hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+	idev = __in6_dev_get(skb->dev);
+
+	accept_seg6 = net->ipv6.devconf_all->seg6_enabled;
+	if (accept_seg6 > idev->cnf.seg6_enabled)
+		accept_seg6 = idev->cnf.seg6_enabled;
+
+	if (!accept_seg6) {
+		kfree_skb(skb);
+		return -1;
+	}
+
+looped_back:
+	if (hdr->segments_left > 0) {
+		if (hdr->nexthdr != NEXTHDR_IPV6 && hdr->segments_left == 1 &&
+		    sr_has_cleanup(hdr))
+			cleanup = true;
+	} else {
+		if (hdr->nexthdr == NEXTHDR_IPV6) {
+			int offset = (hdr->hdrlen + 1) << 3;
+
+			skb_postpull_rcsum(skb, skb_network_header(skb),
+					   skb_network_header_len(skb));
+
+			if (!pskb_pull(skb, offset)) {
+				kfree_skb(skb);
+				return -1;
+			}
+			skb_postpull_rcsum(skb, skb_transport_header(skb),
+					   offset);
+
+			skb_reset_network_header(skb);
+			skb_reset_transport_header(skb);
+			skb->encapsulation = 0;
+
+			__skb_tunnel_rx(skb, skb->dev, net);
+
+			netif_rx(skb);
+			return -1;
+		}
+
+		opt->srcrt = skb_network_header_len(skb);
+		opt->lastopt = opt->srcrt;
+		skb->transport_header += (hdr->hdrlen + 1) << 3;
+		opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+		return 1;
+	}
+
+	if (hdr->segments_left >= (hdr->hdrlen >> 1)) {
+		__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+				IPSTATS_MIB_INHDRERRORS);
+		icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+				  ((&hdr->segments_left) -
+				   skb_network_header(skb)));
+		kfree_skb(skb);
+		return -1;
+	}
+
+	if (skb_cloned(skb)) {
+		if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
+			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+					IPSTATS_MIB_OUTDISCARDS);
+			kfree_skb(skb);
+			return -1;
+		}
+	}
+
+	hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+	hdr->segments_left--;
+	addr = hdr->segments + hdr->segments_left;
+
+	skb_push(skb, sizeof(struct ipv6hdr));
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE)
+		seg6_update_csum(skb);
+
+	ipv6_hdr(skb)->daddr = *addr;
+
+	if (cleanup) {
+		int srhlen = (hdr->hdrlen + 1) << 3;
+		int nh = hdr->nexthdr;
+
+		skb_pull_rcsum(skb, sizeof(struct ipv6hdr) + srhlen);
+		memmove(skb_network_header(skb) + srhlen,
+			skb_network_header(skb),
+			(unsigned char *)hdr - skb_network_header(skb));
+		skb->network_header += srhlen;
+		ipv6_hdr(skb)->nexthdr = nh;
+		ipv6_hdr(skb)->payload_len = htons(skb->len -
+						   sizeof(struct ipv6hdr));
+		skb_push_rcsum(skb, sizeof(struct ipv6hdr));
+	}
+
+	skb_dst_drop(skb);
+
+	ip6_route_input(skb);
+
+	if (skb_dst(skb)->error) {
+		dst_input(skb);
+		return -1;
+	}
+
+	if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+		if (ipv6_hdr(skb)->hop_limit <= 1) {
+			__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+					IPSTATS_MIB_INHDRERRORS);
+			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+				    ICMPV6_EXC_HOPLIMIT, 0);
+			kfree_skb(skb);
+			return -1;
+		}
+		ipv6_hdr(skb)->hop_limit--;
+
+		/* be sure that srh is still present before reinjecting */
+		if (!cleanup) {
+			skb_pull(skb, sizeof(struct ipv6hdr));
+			goto looped_back;
+		}
+		skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+		IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+	}
+
+	dst_input(skb);
+
+	return -1;
+}
+
 /********************************
   Routing header.
  ********************************/
@@ -326,6 +497,10 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
 		return -1;
 	}
 
+	/* segment routing */
+	if (hdr->type == IPV6_SRCRT_TYPE_4)
+		return ipv6_srh_rcv(skb);
+
 looped_back:
 	if (hdr->segments_left == 0) {
 		switch (hdr->type) {
-- 
cgit v1.2.3


From 915d7e5e5930b4f01d0971d93b9b25ed17d221aa Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:40 +0100
Subject: ipv6: sr: add code base for control plane support of SR-IPv6

This patch adds the necessary hooks and structures to provide support
for SR-IPv6 control plane, essentially the Generic Netlink commands
that will be used for userspace control over the Segment Routing
kernel structures.

The genetlink commands provide control over two different structures:
tunnel source and HMAC data. The tunnel source is the source address
that will be used by default when encapsulating packets into an
outer IPv6 header + SRH. If the tunnel source is set to :: then an
address of the outgoing interface will be selected as the source.

The HMAC commands currently just return ENOTSUPP and will be implemented
in a future patch.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/seg6_genl.h      |   6 ++
 include/net/netns/ipv6.h       |   1 +
 include/net/seg6.h             |  16 +++
 include/uapi/linux/seg6_genl.h |  32 ++++++
 net/ipv6/Makefile              |   2 +-
 net/ipv6/af_inet6.c            |   9 +-
 net/ipv6/seg6.c                | 214 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 278 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/seg6_genl.h
 create mode 100644 include/uapi/linux/seg6_genl.h
 create mode 100644 net/ipv6/seg6.c

(limited to 'include/linux')

diff --git a/include/linux/seg6_genl.h b/include/linux/seg6_genl.h
new file mode 100644
index 000000000000..d6c3fb4f3734
--- /dev/null
+++ b/include/linux/seg6_genl.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_GENL_H
+#define _LINUX_SEG6_GENL_H
+
+#include <uapi/linux/seg6_genl.h>
+
+#endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 10d0848f5b8a..de7745e2edcc 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -85,6 +85,7 @@ struct netns_ipv6 {
 #endif
 	atomic_t		dev_addr_genid;
 	atomic_t		fib6_sernum;
+	struct seg6_pernet_data *seg6_data;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 4dd52a7e95f1..7c7b8ed39661 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -14,6 +14,9 @@
 #ifndef _NET_SEG6_H
 #define _NET_SEG6_H
 
+#include <linux/net.h>
+#include <linux/ipv6.h>
+
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
 {
@@ -33,4 +36,17 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
 }
 
+struct seg6_pernet_data {
+	struct mutex lock;
+	struct in6_addr __rcu *tun_src;
+};
+
+static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
+{
+	return net->ipv6.seg6_data;
+}
+
+extern int seg6_init(void);
+extern void seg6_exit(void);
+
 #endif
diff --git a/include/uapi/linux/seg6_genl.h b/include/uapi/linux/seg6_genl.h
new file mode 100644
index 000000000000..fcf1c60d7df3
--- /dev/null
+++ b/include/uapi/linux/seg6_genl.h
@@ -0,0 +1,32 @@
+#ifndef _UAPI_LINUX_SEG6_GENL_H
+#define _UAPI_LINUX_SEG6_GENL_H
+
+#define SEG6_GENL_NAME		"SEG6"
+#define SEG6_GENL_VERSION	0x1
+
+enum {
+	SEG6_ATTR_UNSPEC,
+	SEG6_ATTR_DST,
+	SEG6_ATTR_DSTLEN,
+	SEG6_ATTR_HMACKEYID,
+	SEG6_ATTR_SECRET,
+	SEG6_ATTR_SECRETLEN,
+	SEG6_ATTR_ALGID,
+	SEG6_ATTR_HMACINFO,
+	__SEG6_ATTR_MAX,
+};
+
+#define SEG6_ATTR_MAX (__SEG6_ATTR_MAX - 1)
+
+enum {
+	SEG6_CMD_UNSPEC,
+	SEG6_CMD_SETHMAC,
+	SEG6_CMD_DUMPHMAC,
+	SEG6_CMD_SET_TUNSRC,
+	SEG6_CMD_GET_TUNSRC,
+	__SEG6_CMD_MAX,
+};
+
+#define SEG6_CMD_MAX (__SEG6_CMD_MAX - 1)
+
+#endif
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c174ccb340a1..c92010d62afc 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
 		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-		udp_offload.o
+		udp_offload.o seg6.o
 
 ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c86911b63f8a..d424f3a3737a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -61,6 +61,7 @@
 #include <net/ip6_tunnel.h>
 #endif
 #include <net/calipso.h>
+#include <net/seg6.h>
 
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
@@ -991,6 +992,10 @@ static int __init inet6_init(void)
 	if (err)
 		goto calipso_fail;
 
+	err = seg6_init();
+	if (err)
+		goto seg6_fail;
+
 #ifdef CONFIG_SYSCTL
 	err = ipv6_sysctl_register();
 	if (err)
@@ -1001,8 +1006,10 @@ out:
 
 #ifdef CONFIG_SYSCTL
 sysctl_fail:
-	calipso_exit();
+	seg6_exit();
 #endif
+seg6_fail:
+	calipso_exit();
 calipso_fail:
 	pingv6_exit();
 pingv6_fail:
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
new file mode 100644
index 000000000000..e246b0ba12ac
--- /dev/null
+++ b/net/ipv6/seg6.c
@@ -0,0 +1,214 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *	  modify it under the terms of the GNU General Public License
+ *	  as published by the Free Software Foundation; either version
+ *	  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/slab.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <linux/seg6.h>
+#include <linux/seg6_genl.h>
+
+static struct genl_family seg6_genl_family;
+
+static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
+	[SEG6_ATTR_DST]				= { .type = NLA_BINARY,
+		.len = sizeof(struct in6_addr) },
+	[SEG6_ATTR_DSTLEN]			= { .type = NLA_S32, },
+	[SEG6_ATTR_HMACKEYID]		= { .type = NLA_U32, },
+	[SEG6_ATTR_SECRET]			= { .type = NLA_BINARY, },
+	[SEG6_ATTR_SECRETLEN]		= { .type = NLA_U8, },
+	[SEG6_ATTR_ALGID]			= { .type = NLA_U8, },
+	[SEG6_ATTR_HMACINFO]		= { .type = NLA_NESTED, },
+};
+
+static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
+{
+	return -ENOTSUPP;
+}
+
+/* Replace the per-netns tunnel source with the address in SEG6_ATTR_DST. */
+static int seg6_genl_set_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(genl_info_net(info));
+	struct nlattr *attr = info->attrs[SEG6_ATTR_DST];
+	struct in6_addr *t_old, *t_new;
+
+	/* NLA_BINARY only caps the length, so reject short payloads here */
+	if (!attr || nla_len(attr) < (int)sizeof(*t_new))
+		return -EINVAL;
+
+	/* readers dereference tun_src unconditionally: never publish NULL */
+	t_new = kmemdup(nla_data(attr), sizeof(*t_new), GFP_KERNEL);
+	if (!t_new)
+		return -ENOMEM;
+
+	mutex_lock(&sdata->lock);
+	t_old = sdata->tun_src;
+	rcu_assign_pointer(sdata->tun_src, t_new);
+	mutex_unlock(&sdata->lock);
+
+	synchronize_net();
+	kfree(t_old);
+
+	return 0;
+}
+
+/* SEG6_CMD_GET_TUNSRC: reply with the per-netns tunnel source address in a
+ * SEG6_ATTR_DST attribute; returns the genlmsg_reply() result or -ENOMEM.
+ */
+static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct in6_addr *tun_src;
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			  &seg6_genl_family, 0, SEG6_CMD_GET_TUNSRC);
+	if (!hdr)
+		goto free_msg;
+
+	rcu_read_lock();
+	tun_src = rcu_dereference(seg6_pernet(net)->tun_src);
+
+	if (nla_put(msg, SEG6_ATTR_DST, sizeof(struct in6_addr), tun_src))
+		goto nla_put_failure;
+
+	rcu_read_unlock();
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+nla_put_failure:
+	rcu_read_unlock();
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return -ENOMEM;
+}
+
+static int seg6_genl_dumphmac(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return -ENOTSUPP;
+}
+
+static int __net_init seg6_net_init(struct net *net)
+{
+	struct seg6_pernet_data *sdata;
+
+	sdata = kzalloc(sizeof(*sdata), GFP_KERNEL);
+	if (!sdata)
+		return -ENOMEM;
+
+	mutex_init(&sdata->lock);
+
+	sdata->tun_src = kzalloc(sizeof(*sdata->tun_src), GFP_KERNEL);
+	if (!sdata->tun_src) {
+		kfree(sdata);
+		return -ENOMEM;
+	}
+
+	net->ipv6.seg6_data = sdata;
+
+	return 0;
+}
+
+static void __net_exit seg6_net_exit(struct net *net)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+	kfree(sdata->tun_src);
+	kfree(sdata);
+}
+
+static struct pernet_operations ip6_segments_ops = {
+	.init = seg6_net_init,
+	.exit = seg6_net_exit,
+};
+
+static const struct genl_ops seg6_genl_ops[] = {
+	{
+		.cmd	= SEG6_CMD_SETHMAC,
+		.doit	= seg6_genl_sethmac,
+		.policy	= seg6_genl_policy,
+		.flags	= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd	= SEG6_CMD_DUMPHMAC,
+		.dumpit	= seg6_genl_dumphmac,
+		.policy	= seg6_genl_policy,
+		.flags	= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd	= SEG6_CMD_SET_TUNSRC,
+		.doit	= seg6_genl_set_tunsrc,
+		.policy	= seg6_genl_policy,
+		.flags	= GENL_ADMIN_PERM,
+	},
+	{
+		.cmd	= SEG6_CMD_GET_TUNSRC,
+		.doit	= seg6_genl_get_tunsrc,
+		.policy = seg6_genl_policy,
+		.flags	= GENL_ADMIN_PERM,
+	},
+};
+
+static struct genl_family seg6_genl_family __ro_after_init = {
+	.hdrsize	= 0,
+	.name		= SEG6_GENL_NAME,
+	.version	= SEG6_GENL_VERSION,
+	.maxattr	= SEG6_ATTR_MAX,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.ops		= seg6_genl_ops,
+	.n_ops		= ARRAY_SIZE(seg6_genl_ops),
+	.module		= THIS_MODULE,
+};
+
+int __init seg6_init(void)
+{
+	int err = -ENOMEM;
+
+	err = genl_register_family(&seg6_genl_family);
+	if (err)
+		goto out;
+
+	err = register_pernet_subsys(&ip6_segments_ops);
+	if (err)
+		goto out_unregister_genl;
+
+	pr_info("Segment Routing with IPv6\n");
+
+out:
+	return err;
+out_unregister_genl:
+	genl_unregister_family(&seg6_genl_family);
+	goto out;
+}
+
+void seg6_exit(void)
+{
+	unregister_pernet_subsys(&ip6_segments_ops);
+	genl_unregister_family(&seg6_genl_family);
+}
-- 
cgit v1.2.3


From 6c8702c60b88651072460f3f4026c7dfe2521d12 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:41 +0100
Subject: ipv6: sr: add support for SRH encapsulation and injection with
 lwtunnels

This patch creates a new type of interfaceless lightweight tunnel (SEG6),
enabling the encapsulation and injection of SRH within locally emitted
packets and forwarded packets.

>From a configuration viewpoint, a seg6 tunnel would be configured as follows:

  ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0

Any packet whose destination address is fc00::1 would thus be encapsulated
within an outer IPv6 header containing the SRH with three segments, and would
actually be routed to the first segment of the list. If `mode inline' was
specified instead of `mode encap', then the SRH would be directly inserted
after the IPv6 header without outer encapsulation.

The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This
feature was made configurable because direct header insertion may break
several mechanisms such as PMTUD or IPSec AH.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/seg6_iptunnel.h      |   6 +
 include/net/seg6.h                 |   6 +
 include/uapi/linux/lwtunnel.h      |   1 +
 include/uapi/linux/seg6_iptunnel.h |  44 ++++
 net/core/lwtunnel.c                |   2 +
 net/ipv6/Kconfig                   |  12 ++
 net/ipv6/Makefile                  |   2 +-
 net/ipv6/seg6.c                    |  44 ++++
 net/ipv6/seg6_iptunnel.c           | 410 +++++++++++++++++++++++++++++++++++++
 9 files changed, 526 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/seg6_iptunnel.h
 create mode 100644 include/uapi/linux/seg6_iptunnel.h
 create mode 100644 net/ipv6/seg6_iptunnel.c

(limited to 'include/linux')

diff --git a/include/linux/seg6_iptunnel.h b/include/linux/seg6_iptunnel.h
new file mode 100644
index 000000000000..5377cf6a5a02
--- /dev/null
+++ b/include/linux/seg6_iptunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_IPTUNNEL_H
+#define _LINUX_SEG6_IPTUNNEL_H
+
+#include <uapi/linux/seg6_iptunnel.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 7c7b8ed39661..ff5da0ce83e9 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -16,6 +16,8 @@
 
 #include <linux/net.h>
 #include <linux/ipv6.h>
+#include <net/lwtunnel.h>
+#include <linux/seg6.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
 
 extern int seg6_init(void);
 extern void seg6_exit(void);
+extern int seg6_iptunnel_init(void);
+extern void seg6_iptunnel_exit(void);
+
+extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 
 #endif
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index a478fe80e203..453cc6215bfd 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -9,6 +9,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_IP,
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
+	LWTUNNEL_ENCAP_SEG6,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
new file mode 100644
index 000000000000..0f7dbd280a9c
--- /dev/null
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -0,0 +1,44 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_IPTUNNEL_H
+#define _UAPI_LINUX_SEG6_IPTUNNEL_H
+
+enum {
+	SEG6_IPTUNNEL_UNSPEC,
+	SEG6_IPTUNNEL_SRH,
+	__SEG6_IPTUNNEL_MAX,
+};
+#define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1)
+
+struct seg6_iptunnel_encap {
+	int mode;
+	struct ipv6_sr_hdr srh[0];
+};
+/* Size in bytes of a seg6_iptunnel_encap together with its trailing SRH. */
+#define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*(x))) + (((x)->srh->hdrlen + 1) << 3))
+
+enum {
+	SEG6_IPTUN_MODE_INLINE,
+	SEG6_IPTUN_MODE_ENCAP,
+};
+
+static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
+{
+	int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP);
+
+	return ((tuninfo->srh->hdrlen + 1) << 3) +
+	       (encap * sizeof(struct ipv6hdr));
+}
+
+#endif
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 88fd64250b02..03976e939818 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -39,6 +39,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "MPLS";
 	case LWTUNNEL_ENCAP_ILA:
 		return "ILA";
+	case LWTUNNEL_ENCAP_SEG6:
+		return "SEG6";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f2e0bf..1123a001d729 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -289,4 +289,16 @@ config IPV6_PIMSM_V2
 	  Support for IPv6 PIM multicast routing protocol PIM-SMv2.
 	  If unsure, say N.
 
+config IPV6_SEG6_INLINE
+	bool "IPv6: direct Segment Routing Header insertion "
+	depends on IPV6
+	---help---
+	  Support for direct insertion of the Segment Routing Header,
+	  also known as inline mode. Be aware that direct insertion of
+	  extension headers (as opposed to encapsulation) may break
+	  multiple mechanisms such as PMTUD or IPSec AH. Use this feature
+	  only if you know exactly what you are doing.
+
+	  If unsure, say N.
+
 endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c92010d62afc..59ee92fb3689 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
 		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-		udp_offload.o seg6.o
+		udp_offload.o seg6.o seg6_iptunnel.o
 
 ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index e246b0ba12ac..9c78053e67e0 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -26,6 +26,43 @@
 #include <linux/seg6.h>
 #include <linux/seg6_genl.h>
 
+/* Validate a user-supplied SRH: type 4, hdrlen matching @len, segments_left
+ * equal to first_segment, and trailing TLVs that exactly fill the remainder.
+ */
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
+{
+	int trailing;
+	unsigned int tlv_offset;
+
+	if (srh->type != IPV6_SRCRT_TYPE_4 || ((srh->hdrlen + 1) << 3) != len)
+		return false;
+
+	if (srh->segments_left != srh->first_segment)
+		return false;
+
+	tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
+
+	trailing = len - tlv_offset;
+	if (trailing < 0)
+		return false;
+
+	while (trailing) {
+		struct sr6_tlv *tlv;
+		unsigned int tlv_len;
+
+		/* the TLV header itself must fit before tlv->len is read */
+		if (trailing < (int)sizeof(*tlv))
+			return false;
+		tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
+		tlv_len = sizeof(*tlv) + tlv->len;
+		trailing -= tlv_len;
+		if (trailing < 0)
+			return false;
+		tlv_offset += tlv_len;
+	}
+	return true;
+}
+
 static struct genl_family seg6_genl_family;
 
 static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
@@ -198,10 +235,16 @@ int __init seg6_init(void)
 	if (err)
 		goto out_unregister_genl;
 
+	err = seg6_iptunnel_init();
+	if (err)
+		goto out_unregister_pernet;
+
 	pr_info("Segment Routing with IPv6\n");
 
 out:
 	return err;
+out_unregister_pernet:
+	unregister_pernet_subsys(&ip6_segments_ops);
 out_unregister_genl:
 	genl_unregister_family(&seg6_genl_family);
 	goto out;
@@ -209,6 +252,7 @@ out_unregister_genl:
 
 void seg6_exit(void)
 {
+	seg6_iptunnel_exit();
 	unregister_pernet_subsys(&ip6_segments_ops);
 	genl_unregister_family(&seg6_genl_family);
 }
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
new file mode 100644
index 000000000000..39762b2fa7a2
--- /dev/null
+++ b/net/ipv6/seg6_iptunnel.c
@@ -0,0 +1,410 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *        modify it under the terms of the GNU General Public License
+ *        as published by the Free Software Foundation; either version
+ *        2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_iptunnel.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#ifdef CONFIG_DST_CACHE
+#include <net/dst_cache.h>
+#endif
+
+struct seg6_lwt {
+#ifdef CONFIG_DST_CACHE
+	struct dst_cache cache;
+#endif
+	struct seg6_iptunnel_encap tuninfo[0];
+};
+
+static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct seg6_lwt *)lwt->data;
+}
+
+static inline struct seg6_iptunnel_encap *
+seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return seg6_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
+	[SEG6_IPTUNNEL_SRH]	= { .type = NLA_BINARY },
+};
+
+int nla_put_srh(struct sk_buff *skb, int attrtype,
+		struct seg6_iptunnel_encap *tuninfo)
+{
+	struct seg6_iptunnel_encap *data;
+	struct nlattr *nla;
+	int len;
+
+	len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
+
+	nla = nla_reserve(skb, attrtype, len);
+	if (!nla)
+		return -EMSGSIZE;
+
+	data = nla_data(nla);
+	memcpy(data, tuninfo, len);
+
+	return 0;
+}
+
+static void set_tun_src(struct net *net, struct net_device *dev,
+			struct in6_addr *daddr, struct in6_addr *saddr)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+	struct in6_addr *tun_src;
+
+	rcu_read_lock();
+
+	tun_src = rcu_dereference(sdata->tun_src);
+
+	if (!ipv6_addr_any(tun_src)) {
+		memcpy(saddr, tun_src, sizeof(struct in6_addr));
+	} else {
+		ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
+				   saddr);
+	}
+
+	rcu_read_unlock();
+}
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
+static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct ipv6hdr *hdr, *inner_hdr;
+	struct ipv6_sr_hdr *isrh;
+	int hdrlen, tot_len, err;
+
+	hdrlen = (osrh->hdrlen + 1) << 3;
+	tot_len = hdrlen + sizeof(*hdr);
+
+	err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC);
+	if (unlikely(err))
+		return err;
+
+	inner_hdr = ipv6_hdr(skb);
+
+	skb_push(skb, tot_len);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+	hdr = ipv6_hdr(skb);
+
+	/* inherit tc, flowlabel and hlim
+	 * hlim will be decremented in ip6_forward() afterwards and
+	 * decapsulation will overwrite inner hlim with outer hlim
+	 */
+	ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+		     ip6_flowlabel(inner_hdr));
+	hdr->hop_limit = inner_hdr->hop_limit;
+	hdr->nexthdr = NEXTHDR_ROUTING;
+
+	isrh = (void *)hdr + sizeof(*hdr);
+	memcpy(isrh, osrh, hdrlen);
+
+	isrh->nexthdr = NEXTHDR_IPV6;
+
+	hdr->daddr = isrh->segments[isrh->first_segment];
+	set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
+
+	skb_postpush_rcsum(skb, hdr, tot_len);
+
+	return 0;
+}
+
+/* insert an SRH within an IPv6 packet, just after the IPv6 header */
+#ifdef CONFIG_IPV6_SEG6_INLINE
+static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+	struct ipv6hdr *hdr, *oldhdr;
+	struct ipv6_sr_hdr *isrh;
+	int hdrlen, err;
+
+	hdrlen = (osrh->hdrlen + 1) << 3;
+
+	err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC);
+	if (unlikely(err))
+		return err;
+
+	oldhdr = ipv6_hdr(skb);
+
+	skb_pull(skb, sizeof(struct ipv6hdr));
+	skb_postpull_rcsum(skb, skb_network_header(skb),
+			   sizeof(struct ipv6hdr));
+
+	skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	hdr = ipv6_hdr(skb);
+
+	memmove(hdr, oldhdr, sizeof(*hdr));
+
+	isrh = (void *)hdr + sizeof(*hdr);
+	memcpy(isrh, osrh, hdrlen);
+
+	isrh->nexthdr = hdr->nexthdr;
+	hdr->nexthdr = NEXTHDR_ROUTING;
+
+	isrh->segments[0] = hdr->daddr;
+	hdr->daddr = isrh->segments[isrh->first_segment];
+
+	skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+	return 0;
+}
+#endif
+
+static int seg6_do_srh(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct seg6_iptunnel_encap *tinfo;
+	int err = 0;
+
+	tinfo = seg6_encap_lwtunnel(dst->lwtstate);
+
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
+	switch (tinfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+	case SEG6_IPTUN_MODE_INLINE:
+		err = seg6_do_srh_inline(skb, tinfo->srh);
+		skb_reset_inner_headers(skb);
+		break;
+#endif
+	case SEG6_IPTUN_MODE_ENCAP:
+		err = seg6_do_srh_encap(skb, tinfo->srh);
+		break;
+	}
+
+	if (err)
+		return err;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	skb_set_inner_protocol(skb, skb->protocol);
+
+	return 0;
+}
+
+int seg6_input(struct sk_buff *skb)
+{
+	int err;
+
+	err = seg6_do_srh(skb);
+	if (unlikely(err)) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	skb_dst_drop(skb);
+	ip6_route_input(skb);
+
+	return dst_input(skb);
+}
+
+/* lwtunnel output hook: add the SRH, pick a route for the result and send. */
+int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct dst_entry *dst = NULL;
+	struct seg6_lwt *slwt;
+	int err;
+
+	err = seg6_do_srh(skb);
+	if (unlikely(err))
+		goto drop;
+
+	slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
+#ifdef CONFIG_DST_CACHE
+	dst = dst_cache_get(&slwt->cache);
+#endif
+
+	if (unlikely(!dst)) {
+		struct ipv6hdr *hdr = ipv6_hdr(skb);
+		struct flowi6 fl6;
+
+		/* clear the key first: only five of its fields are set here */
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.daddr = hdr->daddr;
+		fl6.saddr = hdr->saddr;
+		fl6.flowlabel = ip6_flowinfo(hdr);
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_proto = hdr->nexthdr;
+
+		dst = ip6_route_output(net, NULL, &fl6);
+		if (dst->error) {
+			err = dst->error;
+			dst_release(dst);
+			goto drop;
+		}
+#ifdef CONFIG_DST_CACHE
+		dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
+#endif
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+	return dst_output(net, sk, skb);
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+static int seg6_build_state(struct net_device *dev, struct nlattr *nla,
+			    unsigned int family, const void *cfg,
+			    struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
+	struct seg6_iptunnel_encap *tuninfo;
+	struct lwtunnel_state *newts;
+	int tuninfo_len, min_size;
+	struct seg6_lwt *slwt;
+	int err;
+
+	err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
+			       seg6_iptunnel_policy);
+
+	if (err < 0)
+		return err;
+
+	if (!tb[SEG6_IPTUNNEL_SRH])
+		return -EINVAL;
+
+	tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
+	tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
+
+	/* tuninfo must contain at least the iptunnel encap structure,
+	 * the SRH and one segment
+	 */
+	min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
+		   sizeof(struct in6_addr);
+	if (tuninfo_len < min_size)
+		return -EINVAL;
+
+	switch (tuninfo->mode) {
+#ifdef CONFIG_IPV6_SEG6_INLINE
+	case SEG6_IPTUN_MODE_INLINE:
+		break;
+#endif
+	case SEG6_IPTUN_MODE_ENCAP:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* verify that SRH is consistent */
+	if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
+	if (!newts)
+		return -ENOMEM;
+
+	slwt = seg6_lwt_lwtunnel(newts);
+
+#ifdef CONFIG_DST_CACHE
+	err = dst_cache_init(&slwt->cache, GFP_KERNEL);
+	if (err) {
+		kfree(newts);
+		return err;
+	}
+#endif
+
+	memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
+
+	newts->type = LWTUNNEL_ENCAP_SEG6;
+	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
+			LWTUNNEL_STATE_INPUT_REDIRECT;
+	newts->headroom = seg6_lwt_headroom(tuninfo);
+
+	*ts = newts;
+
+	return 0;
+}
+
+#ifdef CONFIG_DST_CACHE
+static void seg6_destroy_state(struct lwtunnel_state *lwt)
+{
+	dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
+}
+#endif
+
+static int seg6_fill_encap_info(struct sk_buff *skb,
+				struct lwtunnel_state *lwtstate)
+{
+	struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+	if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
+
+	return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
+}
+
+static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
+	struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
+	int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
+
+	if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
+		return 1;
+
+	return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops seg6_iptun_ops = {
+	.build_state = seg6_build_state,
+#ifdef CONFIG_DST_CACHE
+	.destroy_state = seg6_destroy_state,
+#endif
+	.output = seg6_output,
+	.input = seg6_input,
+	.fill_encap = seg6_fill_encap_info,
+	.get_encap_size = seg6_encap_nlsize,
+	.cmp_encap = seg6_encap_cmp,
+};
+
+int __init seg6_iptunnel_init(void)
+{
+	return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
+
+void seg6_iptunnel_exit(void)
+{
+	lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
+}
-- 
cgit v1.2.3


From bf355b8d2c30a289232042cacc1cfaea4923936c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:42 +0100
Subject: ipv6: sr: add core files for SR HMAC support

This patch adds the necessary functions to compute and check the HMAC signature
of an SR-enabled packet. Two HMAC algorithms are supported: hmac(sha1) and
hmac(sha256).

In order to avoid dynamic memory allocation for each HMAC computation,
a per-cpu ring buffer is allocated for this purpose.

A new per-interface sysctl called seg6_require_hmac is added, allowing a
user-defined policy for processing HMAC-signed SR-enabled packets.
A value of -1 means that the HMAC field will always be ignored.
A value of 0 means that if an HMAC field is present, its validity will
be enforced (the packet is dropped if the signature is incorrect).
Finally, a value of 1 means that any SR-enabled packet that does not
contain an HMAC signature or whose signature is incorrect will be dropped.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h           |   3 +
 include/linux/seg6_hmac.h      |   6 +
 include/net/seg6.h             |   4 +
 include/net/seg6_hmac.h        |  62 ++++++
 include/uapi/linux/ipv6.h      |   1 +
 include/uapi/linux/seg6_hmac.h |  21 ++
 net/ipv6/Kconfig               |  12 +
 net/ipv6/Makefile              |   1 +
 net/ipv6/addrconf.c            |  18 ++
 net/ipv6/seg6_hmac.c           | 484 +++++++++++++++++++++++++++++++++++++++++
 10 files changed, 612 insertions(+)
 create mode 100644 include/linux/seg6_hmac.h
 create mode 100644 include/net/seg6_hmac.h
 create mode 100644 include/uapi/linux/seg6_hmac.h
 create mode 100644 net/ipv6/seg6_hmac.c

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 68d3f71f0abf..93756585521f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -65,6 +65,9 @@ struct ipv6_devconf {
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
 	__s32		seg6_enabled;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	__s32		seg6_require_hmac;
+#endif
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/linux/seg6_hmac.h b/include/linux/seg6_hmac.h
new file mode 100644
index 000000000000..da437ebdc6cd
--- /dev/null
+++ b/include/linux/seg6_hmac.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_HMAC_H
+#define _LINUX_SEG6_HMAC_H
+
+#include <uapi/linux/seg6_hmac.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
index ff5da0ce83e9..4e0357517d79 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -18,6 +18,7 @@
 #include <linux/ipv6.h>
 #include <net/lwtunnel.h>
 #include <linux/seg6.h>
+#include <linux/rhashtable.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -41,6 +42,9 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 struct seg6_pernet_data {
 	struct mutex lock;
 	struct in6_addr __rcu *tun_src;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	struct rhashtable hmac_infos;
+#endif
 };
 
 static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
new file mode 100644
index 000000000000..69c3a106056b
--- /dev/null
+++ b/include/net/seg6_hmac.h
@@ -0,0 +1,62 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_HMAC_H
+#define _NET_SEG6_HMAC_H
+
+#include <net/flow.h>
+#include <net/ip6_fib.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <net/seg6.h>
+#include <linux/seg6_hmac.h>
+#include <linux/rhashtable.h>
+
+#define SEG6_HMAC_MAX_DIGESTSIZE	160
+#define SEG6_HMAC_RING_SIZE		256
+
+struct seg6_hmac_info {
+	struct rhash_head node;
+	struct rcu_head rcu;
+
+	u32 hmackeyid;
+	char secret[SEG6_HMAC_SECRET_LEN];
+	u8 slen;
+	u8 alg_id;
+};
+
+struct seg6_hmac_algo {
+	u8 alg_id;
+	char name[64];
+	struct crypto_shash * __percpu *tfms;
+	struct shash_desc * __percpu *shashs;
+};
+
+extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo,
+			     struct ipv6_sr_hdr *hdr, struct in6_addr *saddr,
+			     u8 *output);
+extern struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key);
+extern int seg6_hmac_info_add(struct net *net, u32 key,
+			      struct seg6_hmac_info *hinfo);
+extern int seg6_hmac_info_del(struct net *net, u32 key);
+extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+			  struct ipv6_sr_hdr *srh);
+extern bool seg6_hmac_validate_skb(struct sk_buff *skb);
+extern int seg6_hmac_init(void);
+extern void seg6_hmac_exit(void);
+extern int seg6_hmac_net_init(struct net *net);
+extern void seg6_hmac_net_exit(struct net *net);
+
+#endif
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 7ff1d654e333..53561be1ac21 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -180,6 +180,7 @@ enum {
 	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
 	DEVCONF_SEG6_ENABLED,
+	DEVCONF_SEG6_REQUIRE_HMAC,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/seg6_hmac.h b/include/uapi/linux/seg6_hmac.h
new file mode 100644
index 000000000000..b652dfd51bc5
--- /dev/null
+++ b/include/uapi/linux/seg6_hmac.h
@@ -0,0 +1,21 @@
+#ifndef _UAPI_LINUX_SEG6_HMAC_H
+#define _UAPI_LINUX_SEG6_HMAC_H
+
+#include <linux/seg6.h>
+
+#define SEG6_HMAC_SECRET_LEN	64
+#define SEG6_HMAC_FIELD_LEN	32
+
+struct sr6_tlv_hmac {
+	struct sr6_tlv tlvhdr;
+	__u16 reserved;
+	__be32 hmackeyid;
+	__u8 hmac[SEG6_HMAC_FIELD_LEN];
+};
+
+enum {
+	SEG6_HMAC_ALGO_SHA1 = 1,
+	SEG6_HMAC_ALGO_SHA256 = 2,
+};
+
+#endif
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 1123a001d729..0f00811a785f 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -301,4 +301,16 @@ config IPV6_SEG6_INLINE
 
 	  If unsure, say N.
 
+config IPV6_SEG6_HMAC
+	bool "IPv6: Segment Routing HMAC support"
+	depends on IPV6
+	select CRYPTO_HMAC
+	select CRYPTO_SHA1
+	select CRYPTO_SHA256
+	---help---
+	  Support for HMAC signature generation and verification
+	  of SR-enabled packets.
+
+	  If unsure, say N.
+
 endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 59ee92fb3689..129cad2ba960 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
 obj-$(CONFIG_IPV6_FOU) += fou6.o
+obj-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 
 obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
 obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2ac6cb460af0..86219c0a0104 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -239,6 +239,9 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
 	.seg6_enabled		= 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	.seg6_require_hmac	= 0,
+#endif
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -286,6 +289,9 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.ignore_routes_with_linkdown = 0,
 	.keep_addr_on_down	= 0,
 	.seg6_enabled		= 0,
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	.seg6_require_hmac	= 0,
+#endif
 };
 
 /* Check if a valid qdisc is available */
@@ -4947,6 +4953,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 	array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
+#endif
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6045,6 +6054,15 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	{
+		.procname	= "seg6_require_hmac",
+		.data		= &ipv6_devconf.seg6_require_hmac,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{
 		/* sentinel */
 	}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
new file mode 100644
index 000000000000..ef1c8a46e7ac
--- /dev/null
+++ b/net/ipv6/seg6_hmac.c
@@ -0,0 +1,484 @@
+/*
+ *  SR-IPv6 implementation -- HMAC functions
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in6.h>
+#include <linux/icmpv6.h>
+#include <linux/mroute6.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/protocol.h>
+#include <net/transp_v6.h>
+#include <net/rawv6.h>
+#include <net/ndisc.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+
+#include <linux/cryptohash.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <net/seg6.h>
+#include <net/genetlink.h>
+#include <net/seg6_hmac.h>
+#include <linux/random.h>
+
+static char * __percpu *hmac_ring;
+
+static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+	const struct seg6_hmac_info *hinfo = obj;
+
+	return (hinfo->hmackeyid != *(__u32 *)arg->key);
+}
+
+static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo)
+{
+	kfree_rcu(hinfo, rcu);
+}
+
+static void seg6_free_hi(void *ptr, void *arg)
+{
+	struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr;
+
+	if (hinfo)
+		seg6_hinfo_release(hinfo);
+}
+
+static const struct rhashtable_params rht_params = {
+	.head_offset		= offsetof(struct seg6_hmac_info, node),
+	.key_offset		= offsetof(struct seg6_hmac_info, hmackeyid),
+	.key_len		= sizeof(u32),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= seg6_hmac_cmpfn,
+};
+
+static struct seg6_hmac_algo hmac_algos[] = {
+	{
+		.alg_id = SEG6_HMAC_ALGO_SHA1,
+		.name = "hmac(sha1)",
+	},
+	{
+		.alg_id = SEG6_HMAC_ALGO_SHA256,
+		.name = "hmac(sha256)",
+	},
+};
+
+static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
+{
+	struct sr6_tlv_hmac *tlv;
+
+	if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5)
+		return NULL;
+
+	if (!sr_has_hmac(srh))
+		return NULL;
+
+	tlv = (struct sr6_tlv_hmac *)
+	      ((char *)srh + ((srh->hdrlen + 1) << 3) - 40);
+
+	if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38)
+		return NULL;
+
+	return tlv;
+}
+
+static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
+{
+	struct seg6_hmac_algo *algo;
+	int i, alg_count;
+
+	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+	for (i = 0; i < alg_count; i++) {
+		algo = &hmac_algos[i];
+		if (algo->alg_id == alg_id)
+			return algo;
+	}
+
+	return NULL;
+}
+
+static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
+		     u8 *output, int outlen)
+{
+	struct seg6_hmac_algo *algo;
+	struct crypto_shash *tfm;
+	struct shash_desc *shash;
+	int ret, dgsize;
+
+	algo = __hmac_get_algo(hinfo->alg_id);
+	if (!algo)
+		return -ENOENT;
+
+	tfm = *this_cpu_ptr(algo->tfms);
+
+	dgsize = crypto_shash_digestsize(tfm);
+	if (dgsize > outlen) {
+		pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
+			 dgsize, outlen);
+		return -ENOMEM;
+	}
+
+	ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
+	if (ret < 0) {
+		pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
+		goto failed;
+	}
+
+	shash = *this_cpu_ptr(algo->shashs);
+	shash->tfm = tfm;
+
+	ret = crypto_shash_digest(shash, text, psize, output);
+	if (ret < 0) {
+		pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
+		goto failed;
+	}
+
+	return dgsize;
+
+failed:
+	return ret;
+}
+
+int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
+		      struct in6_addr *saddr, u8 *output)
+{
+	__be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
+	u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
+	int plen, i, dgsize, wrsize;
+	char *ring, *off;
+
+	/* a 160-byte digest output buffer is large enough to hold the longest
+	 * known hash output (RadioGatun), which is up to 1216 bits
+	 */
+
+	/* saddr(16) + first_seg(1) + cleanup(1) + keyid(4) + seglist(16n) */
+	plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
+
+	/* this limit allows for 14 segments */
+	if (plen >= SEG6_HMAC_RING_SIZE)
+		return -EMSGSIZE;
+
+	/* Let's build the HMAC text on the ring buffer. The text is composed
+	 * as follows, in order:
+	 *
+	 * 1. Source IPv6 address (128 bits)
+	 * 2. first_segment value (8 bits)
+	 * 3. cleanup flag (8 bits: highest bit is cleanup value, others are 0)
+	 * 4. HMAC Key ID (32 bits)
+	 * 5. All segments in the segments list (n * 128 bits)
+	 */
+
+	local_bh_disable();
+	ring = *this_cpu_ptr(hmac_ring);
+	off = ring;
+
+	/* source address */
+	memcpy(off, saddr, 16);
+	off += 16;
+
+	/* first_segment value */
+	*off++ = hdr->first_segment;
+
+	/* cleanup flag */
+	*off++ = !!(sr_has_cleanup(hdr)) << 7;
+
+	/* HMAC Key ID */
+	memcpy(off, &hmackeyid, 4);
+	off += 4;
+
+	/* all segments in the list */
+	for (i = 0; i < hdr->first_segment + 1; i++) {
+		memcpy(off, hdr->segments + i, 16);
+		off += 16;
+	}
+
+	dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
+			   SEG6_HMAC_MAX_DIGESTSIZE);
+	local_bh_enable();
+
+	if (dgsize < 0)
+		return dgsize;
+
+	wrsize = SEG6_HMAC_FIELD_LEN;
+	if (wrsize > dgsize)
+		wrsize = dgsize;
+
+	memset(output, 0, SEG6_HMAC_FIELD_LEN);
+	memcpy(output, tmp_out, wrsize);
+
+	return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_compute);
+
+/* checks if an incoming SR-enabled packet's HMAC status matches
+ * the incoming policy.
+ *
+ * called with rcu_read_lock()
+ */
+bool seg6_hmac_validate_skb(struct sk_buff *skb)
+{
+	u8 hmac_output[SEG6_HMAC_FIELD_LEN];
+	struct net *net = dev_net(skb->dev);
+	struct seg6_hmac_info *hinfo;
+	struct sr6_tlv_hmac *tlv;
+	struct ipv6_sr_hdr *srh;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(skb->dev);
+
+	srh = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+
+	tlv = seg6_get_tlv_hmac(srh);
+
+	/* mandatory check but no tlv */
+	if (idev->cnf.seg6_require_hmac > 0 && !tlv)
+		return false;
+
+	/* no check */
+	if (idev->cnf.seg6_require_hmac < 0)
+		return true;
+
+	/* check only if present */
+	if (idev->cnf.seg6_require_hmac == 0 && !tlv)
+		return true;
+
+	/* now, seg6_require_hmac >= 0 && tlv */
+
+	hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+	if (!hinfo)
+		return false;
+
+	if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
+		return false;
+
+	if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL(seg6_hmac_validate_skb);
+
+/* called with rcu_read_lock() */
+struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+	struct seg6_hmac_info *hinfo;
+
+	hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+
+	return hinfo;
+}
+EXPORT_SYMBOL(seg6_hmac_info_lookup);
+
+int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+	int err;
+
+	err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
+					    rht_params);
+
+	return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_add);
+
+int seg6_hmac_info_del(struct net *net, u32 key)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+	struct seg6_hmac_info *hinfo;
+	int err = -ENOENT;
+
+	hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params);
+	if (!hinfo)
+		goto out;
+
+	err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node,
+				     rht_params);
+	if (err)
+		goto out;
+
+	seg6_hinfo_release(hinfo);
+
+out:
+	return err;
+}
+EXPORT_SYMBOL(seg6_hmac_info_del);
+
+int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+		   struct ipv6_sr_hdr *srh)
+{
+	struct seg6_hmac_info *hinfo;
+	struct sr6_tlv_hmac *tlv;
+	int err = -ENOENT;
+
+	tlv = seg6_get_tlv_hmac(srh);
+	if (!tlv)
+		return -EINVAL;
+
+	rcu_read_lock();
+
+	hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid));
+	if (!hinfo)
+		goto out;
+
+	memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN);
+	err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac);
+
+out:
+	rcu_read_unlock();
+	return err;
+}
+EXPORT_SYMBOL(seg6_push_hmac);
+
+static int seg6_hmac_init_ring(void)
+{
+	int i;
+
+	hmac_ring = alloc_percpu(char *);
+
+	if (!hmac_ring)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL);
+
+		if (!ring)
+			return -ENOMEM;
+
+		*per_cpu_ptr(hmac_ring, i) = ring;
+	}
+
+	return 0;
+}
+
+static int seg6_hmac_init_algo(void)
+{
+	struct seg6_hmac_algo *algo;
+	struct crypto_shash *tfm;
+	struct shash_desc *shash;
+	int i, alg_count, cpu;
+
+	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+
+	for (i = 0; i < alg_count; i++) {
+		struct crypto_shash **p_tfm;
+		int shsize;
+
+		algo = &hmac_algos[i];
+		algo->tfms = alloc_percpu(struct crypto_shash *);
+		if (!algo->tfms)
+			return -ENOMEM;
+
+		for_each_possible_cpu(cpu) {
+			tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL);
+			if (IS_ERR(tfm))
+				return PTR_ERR(tfm);
+			p_tfm = per_cpu_ptr(algo->tfms, cpu);
+			*p_tfm = tfm;
+		}
+
+		p_tfm = this_cpu_ptr(algo->tfms);
+		tfm = *p_tfm;
+
+		shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
+
+		algo->shashs = alloc_percpu(struct shash_desc *);
+		if (!algo->shashs)
+			return -ENOMEM;
+
+		for_each_possible_cpu(cpu) {
+			shash = kzalloc(shsize, GFP_KERNEL);
+			if (!shash)
+				return -ENOMEM;
+			*per_cpu_ptr(algo->shashs, cpu) = shash;
+		}
+	}
+
+	return 0;
+}
+
+int __init seg6_hmac_init(void)
+{
+	int ret;
+
+	ret = seg6_hmac_init_ring();
+	if (ret < 0)
+		goto out;
+
+	ret = seg6_hmac_init_algo();
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(seg6_hmac_init);
+
+int __net_init seg6_hmac_net_init(struct net *net)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+	rhashtable_init(&sdata->hmac_infos, &rht_params);
+
+	return 0;
+}
+EXPORT_SYMBOL(seg6_hmac_net_init);
+
+void seg6_hmac_exit(void)
+{
+	struct seg6_hmac_algo *algo = NULL;
+	int i, alg_count, cpu;
+
+	for_each_possible_cpu(i) {
+		char *ring = *per_cpu_ptr(hmac_ring, i);
+
+		kfree(ring);
+	}
+	free_percpu(hmac_ring);
+
+	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+	for (i = 0; i < alg_count; i++) {
+		algo = &hmac_algos[i];
+		for_each_possible_cpu(cpu) {
+			struct crypto_shash *tfm;
+			struct shash_desc *shash;
+
+			shash = *per_cpu_ptr(algo->shashs, cpu);
+			kfree(shash);
+			tfm = *per_cpu_ptr(algo->tfms, cpu);
+			crypto_free_shash(tfm);
+		}
+		free_percpu(algo->tfms);
+		free_percpu(algo->shashs);
+	}
+}
+EXPORT_SYMBOL(seg6_hmac_exit);
+
+void __net_exit seg6_hmac_net_exit(struct net *net)
+{
+	struct seg6_pernet_data *sdata = seg6_pernet(net);
+
+	rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL);
+}
+EXPORT_SYMBOL(seg6_hmac_net_exit);
-- 
cgit v1.2.3


From 149d6ad83663b4820ca09c9d40b1eea7f5c22c2b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Nov 2016 11:07:28 -0800
Subject: net: napi_hash_add() is no longer exported

There are no more users except from net/core/dev.c
napi_hash_add() can now be static.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 -----------
 net/core/dev.c            |  3 +--
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 66fd61c681d9..d64135a0ab71 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -467,17 +467,6 @@ static inline void napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
-/**
- *	napi_hash_add - add a NAPI to global hashtable
- *	@napi: NAPI context
- *
- * Generate a new napi_id and store a @napi under it in napi_hash.
- * Used for busy polling (CONFIG_NET_RX_BUSY_POLL).
- * Note: This is normally automatically done from netif_napi_add(),
- * so might disappear in a future Linux version.
- */
-void napi_hash_add(struct napi_struct *napi);
-
 /**
  *	napi_hash_del - remove a NAPI from global table
  *	@napi: NAPI context
diff --git a/net/core/dev.c b/net/core/dev.c
index c9837fa08dfc..7385c1a152fd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5017,7 +5017,7 @@ EXPORT_SYMBOL(sk_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
-void napi_hash_add(struct napi_struct *napi)
+static void napi_hash_add(struct napi_struct *napi)
 {
 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
@@ -5037,7 +5037,6 @@ void napi_hash_add(struct napi_struct *napi)
 
 	spin_unlock(&napi_hash_lock);
 }
-EXPORT_SYMBOL_GPL(napi_hash_add);
 
 /* Warning : caller is responsible to make sure rcu grace period
  * is respected before freeing memory containing @napi
-- 
cgit v1.2.3


From d8d26354191399627bac9cf0da0667b0f5178686 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 8 Nov 2016 22:49:16 +0100
Subject: ptp: Introduce a high resolution frequency adjustment method.

The internal PTP Hardware Clock (PHC) interface limits the resolution for
frequency adjustments to one part per billion.  However, some hardware
devices allow finer adjustment, and making use of the increased resolution
improves synchronization measurably on such devices.

This patch adds an alternative method that allows finer frequency tuning
by passing the scaled ppm value to PHC drivers.  This value comes from
user space, and it has a resolution of about 0.015 ppb.  We also deprecate
the older method, anticipating its removal once existing drivers have been
converted over.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Ulrik De Bie <ulrik.debie-os@e2big.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          | 5 ++++-
 include/linux/ptp_clock_kernel.h | 8 ++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 86280b7e41f3..9c13381b6966 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -153,7 +153,10 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx)
 		s32 ppb = scaled_ppm_to_ppb(tx->freq);
 		if (ppb > ops->max_adj || ppb < -ops->max_adj)
 			return -ERANGE;
-		err = ops->adjfreq(ops, ppb);
+		if (ops->adjfine)
+			err = ops->adjfine(ops, tx->freq);
+		else
+			err = ops->adjfreq(ops, ppb);
 		ptp->dialed_frequency = tx->freq;
 	} else if (tx->modes == 0) {
 		tx->freq = ptp->dialed_frequency;
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 5ad54fc66cf0..b76d47aba564 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -58,7 +58,14 @@ struct system_device_crosststamp;
  *
  * clock operations
  *
+ * @adjfine:  Adjusts the frequency of the hardware clock.
+ *            parameter scaled_ppm: Desired frequency offset from
+ *            nominal frequency in parts per million, but with a
+ *            16 bit binary fractional field.
+ *
  * @adjfreq:  Adjusts the frequency of the hardware clock.
+ *            This method is deprecated.  New drivers should implement
+ *            the @adjfine method instead.
  *            parameter delta: Desired frequency offset from nominal frequency
  *            in parts per billion
  *
@@ -108,6 +115,7 @@ struct ptp_clock_info {
 	int n_pins;
 	int pps;
 	struct ptp_pin_desc *pin_config;
+	int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);
 	int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
 	int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
 	int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
-- 
cgit v1.2.3


From 2da16a6948ca8f025e2c226ea4fc32baa6b90f27 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:17:25 +0100
Subject: netfilter: ipset: Remove extra whitespaces in ip_set.h

Remove unnecessary whitespaces.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 83b9a2e0d8d4..5b1fd090f34b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -336,14 +336,15 @@ ip_set_update_counter(struct ip_set_counter *counter,
 
 static inline void
 ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
-		      const struct ip_set_ext *ext,
-		      struct ip_set_ext *mext, u32 flags)
+		   const struct ip_set_ext *ext,
+		   struct ip_set_ext *mext, u32 flags)
 {
-		mext->skbmark = skbinfo->skbmark;
-		mext->skbmarkmask = skbinfo->skbmarkmask;
-		mext->skbprio = skbinfo->skbprio;
-		mext->skbqueue = skbinfo->skbqueue;
+	mext->skbmark = skbinfo->skbmark;
+	mext->skbmarkmask = skbinfo->skbmarkmask;
+	mext->skbprio = skbinfo->skbprio;
+	mext->skbqueue = skbinfo->skbqueue;
 }
+
 static inline bool
 ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 {
-- 
cgit v1.2.3


From da9fbfa76f32a031cb70b11e9fa650e30c85d040 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:24:15 +0100
Subject: netfilter: ipset: Mark some helper args as const.

Mark some of the helpers arguments as const.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 4 ++--
 include/linux/netfilter/ipset/ip_set_comment.h | 2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 5b1fd090f34b..524467f933bf 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -346,7 +346,7 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 static inline bool
-ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
 {
 	/* Send nonzero parameters only */
 	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
@@ -373,7 +373,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 static inline bool
-ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter)
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
 {
 	return nla_put_net64(skb, IPSET_ATTR_BYTES,
 			     cpu_to_be64(ip_set_get_bytes(counter)),
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 8d0248525957..bae5c7609be2 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -43,7 +43,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 
 /* Used only when dumping a set, protected by rcu_read_lock_bh() */
 static inline int
-ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment)
+ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
 {
 	struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c);
 
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 1d6a935c1ac5..bfb3531fd88a 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -40,7 +40,7 @@ ip_set_timeout_uget(struct nlattr *tb)
 }
 
 static inline bool
-ip_set_timeout_expired(unsigned long *t)
+ip_set_timeout_expired(const unsigned long *t)
 {
 	return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t);
 }
@@ -63,7 +63,7 @@ ip_set_timeout_set(unsigned long *timeout, u32 value)
 }
 
 static inline u32
-ip_set_timeout_get(unsigned long *timeout)
+ip_set_timeout_get(const unsigned long *timeout)
 {
 	return *timeout == IPSET_ELEM_PERMANENT ? 0 :
 		jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
-- 
cgit v1.2.3


From 7ffea37957b900422ce8b82e9651f7a0a6fac733 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:31:03 +0100
Subject: netfilter: ipset: Headers file cleanup

Group counter helper functions together.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 42 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 524467f933bf..1ea28e30a6dd 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -334,6 +334,27 @@ ip_set_update_counter(struct ip_set_counter *counter,
 	}
 }
 
+static inline bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+	return nla_put_net64(skb, IPSET_ATTR_BYTES,
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
+	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
+}
+
+static inline void
+ip_set_init_counter(struct ip_set_counter *counter,
+		    const struct ip_set_ext *ext)
+{
+	if (ext->bytes != ULLONG_MAX)
+		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
+	if (ext->packets != ULLONG_MAX)
+		atomic64_set(&(counter)->packets, (long long)(ext->packets));
+}
+
 static inline void
 ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 		   const struct ip_set_ext *ext,
@@ -372,27 +393,6 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 	skbinfo->skbqueue = ext->skbqueue;
 }
 
-static inline bool
-ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
-{
-	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter)),
-			     IPSET_ATTR_PAD) ||
-	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)),
-			     IPSET_ATTR_PAD);
-}
-
-static inline void
-ip_set_init_counter(struct ip_set_counter *counter,
-		    const struct ip_set_ext *ext)
-{
-	if (ext->bytes != ULLONG_MAX)
-		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
-	if (ext->packets != ULLONG_MAX)
-		atomic64_set(&(counter)->packets, (long long)(ext->packets));
-}
-
 /* Netlink CB args */
 enum {
 	IPSET_CB_NET = 0,	/* net namespace */
-- 
cgit v1.2.3


From bec810d973003b30bc477146904af6bd93fd2df8 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Tue, 5 May 2015 17:13:28 +0200
Subject: netfilter: ipset: Improve skbinfo get/init helpers

Use struct ip_set_skbinfo in struct ip_set_ext instead of open
coded fields and assign structure members in get/init helpers
instead of copying members one by one. Explicitly note that
struct ip_set_skbinfo must be padded to prevent non-aligned
access in the extension blob.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 30 +++++++++++-------------------
 net/netfilter/ipset/ip_set_core.c      | 12 ++++++------
 net/netfilter/xt_set.c                 | 12 +++++++-----
 3 files changed, 24 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 1ea28e30a6dd..780262124632 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -92,17 +92,6 @@ struct ip_set_ext_type {
 
 extern const struct ip_set_ext_type ip_set_extensions[];
 
-struct ip_set_ext {
-	u64 packets;
-	u64 bytes;
-	u32 timeout;
-	u32 skbmark;
-	u32 skbmarkmask;
-	u32 skbprio;
-	u16 skbqueue;
-	char *comment;
-};
-
 struct ip_set_counter {
 	atomic64_t bytes;
 	atomic64_t packets;
@@ -122,6 +111,15 @@ struct ip_set_skbinfo {
 	u32 skbmarkmask;
 	u32 skbprio;
 	u16 skbqueue;
+	u16 __pad;
+};
+
+struct ip_set_ext {
+	struct ip_set_skbinfo skbinfo;
+	u64 packets;
+	u64 bytes;
+	char *comment;
+	u32 timeout;
 };
 
 struct ip_set;
@@ -360,10 +358,7 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 		   const struct ip_set_ext *ext,
 		   struct ip_set_ext *mext, u32 flags)
 {
-	mext->skbmark = skbinfo->skbmark;
-	mext->skbmarkmask = skbinfo->skbmarkmask;
-	mext->skbprio = skbinfo->skbprio;
-	mext->skbqueue = skbinfo->skbqueue;
+	mext->skbinfo = *skbinfo;
 }
 
 static inline bool
@@ -387,10 +382,7 @@ static inline void
 ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 		    const struct ip_set_ext *ext)
 {
-	skbinfo->skbmark = ext->skbmark;
-	skbinfo->skbmarkmask = ext->skbmarkmask;
-	skbinfo->skbprio = ext->skbprio;
-	skbinfo->skbqueue = ext->skbqueue;
+	*skbinfo = ext->skbinfo;
 }
 
 /* Netlink CB args */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 3f1b945a24d5..bfacccff7196 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -426,20 +426,20 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 		if (!SET_WITH_SKBINFO(set))
 			return -IPSET_ERR_SKBINFO;
 		fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK]));
-		ext->skbmark = fullmark >> 32;
-		ext->skbmarkmask = fullmark & 0xffffffff;
+		ext->skbinfo.skbmark = fullmark >> 32;
+		ext->skbinfo.skbmarkmask = fullmark & 0xffffffff;
 	}
 	if (tb[IPSET_ATTR_SKBPRIO]) {
 		if (!SET_WITH_SKBINFO(set))
 			return -IPSET_ERR_SKBINFO;
-		ext->skbprio = be32_to_cpu(nla_get_be32(
-					    tb[IPSET_ATTR_SKBPRIO]));
+		ext->skbinfo.skbprio =
+			be32_to_cpu(nla_get_be32(tb[IPSET_ATTR_SKBPRIO]));
 	}
 	if (tb[IPSET_ATTR_SKBQUEUE]) {
 		if (!SET_WITH_SKBINFO(set))
 			return -IPSET_ERR_SKBINFO;
-		ext->skbqueue = be16_to_cpu(nla_get_be16(
-					    tb[IPSET_ATTR_SKBQUEUE]));
+		ext->skbinfo.skbqueue =
+			be16_to_cpu(nla_get_be16(tb[IPSET_ATTR_SKBQUEUE]));
 	}
 	return 0;
 }
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 1bfede7be418..64285702afd5 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -423,6 +423,8 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
 
 /* Revision 3 target */
 
+#define MOPT(opt, member)	((opt).ext.skbinfo.member)
+
 static unsigned int
 set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -453,14 +455,14 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par)
 		if (!ret)
 			return XT_CONTINUE;
 		if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK)
-			skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask))
-				    ^ (map_opt.ext.skbmark);
+			skb->mark = (skb->mark & ~MOPT(map_opt,skbmarkmask))
+				    ^ MOPT(map_opt, skbmark);
 		if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO)
-			skb->priority = map_opt.ext.skbprio;
+			skb->priority = MOPT(map_opt, skbprio);
 		if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) &&
 		    skb->dev &&
-		    skb->dev->real_num_tx_queues > map_opt.ext.skbqueue)
-			skb_set_queue_mapping(skb, map_opt.ext.skbqueue);
+		    skb->dev->real_num_tx_queues > MOPT(map_opt, skbqueue))
+			skb_set_queue_mapping(skb, MOPT(map_opt, skbqueue));
 	}
 	return XT_CONTINUE;
 }
-- 
cgit v1.2.3


From 1d0d6bd61d495d271b9774a15fbea93e4875474b Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Wed, 6 May 2015 07:27:28 +0200
Subject: netfilter: ipset: Use kmalloc() in comment extension helper

Allocate memory with kmalloc() rather than kzalloc(): the string
is immediately initialized so it is unnecessary to zero out
the allocated memory area.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set_comment.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index bae5c7609be2..5444b1bbe656 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -34,7 +34,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 		return;
 	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
 		len = IPSET_MAX_COMMENT_SIZE;
-	c = kzalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+	c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
 	if (unlikely(!c))
 		return;
 	strlcpy(c->str, ext->comment, len + 1);
-- 
cgit v1.2.3


From 57982edc2739b4473868e7579c0185270468bae1 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 21:34:56 +0200
Subject: netfilter: ipset: Split extensions into separate files

Cleanup to separate all extensions into individual files.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 95 +-------------------------
 include/linux/netfilter/ipset/ip_set_counter.h | 75 ++++++++++++++++++++
 include/linux/netfilter/ipset/ip_set_skbinfo.h | 46 +++++++++++++
 3 files changed, 123 insertions(+), 93 deletions(-)
 create mode 100644 include/linux/netfilter/ipset/ip_set_counter.h
 create mode 100644 include/linux/netfilter/ipset/ip_set_skbinfo.h

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 780262124632..b5bd0fb3d07b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -292,99 +292,6 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
 	return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
 }
 
-static inline void
-ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)bytes, &(counter)->bytes);
-}
-
-static inline void
-ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)packets, &(counter)->packets);
-}
-
-static inline u64
-ip_set_get_bytes(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->bytes);
-}
-
-static inline u64
-ip_set_get_packets(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->packets);
-}
-
-static inline void
-ip_set_update_counter(struct ip_set_counter *counter,
-		      const struct ip_set_ext *ext,
-		      struct ip_set_ext *mext, u32 flags)
-{
-	if (ext->packets != ULLONG_MAX &&
-	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
-		ip_set_add_bytes(ext->bytes, counter);
-		ip_set_add_packets(ext->packets, counter);
-	}
-	if (flags & IPSET_FLAG_MATCH_COUNTERS) {
-		mext->packets = ip_set_get_packets(counter);
-		mext->bytes = ip_set_get_bytes(counter);
-	}
-}
-
-static inline bool
-ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
-{
-	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter)),
-			     IPSET_ATTR_PAD) ||
-	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)),
-			     IPSET_ATTR_PAD);
-}
-
-static inline void
-ip_set_init_counter(struct ip_set_counter *counter,
-		    const struct ip_set_ext *ext)
-{
-	if (ext->bytes != ULLONG_MAX)
-		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
-	if (ext->packets != ULLONG_MAX)
-		atomic64_set(&(counter)->packets, (long long)(ext->packets));
-}
-
-static inline void
-ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
-		   const struct ip_set_ext *ext,
-		   struct ip_set_ext *mext, u32 flags)
-{
-	mext->skbinfo = *skbinfo;
-}
-
-static inline bool
-ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
-{
-	/* Send nonzero parameters only */
-	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
-		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
-			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask),
-			      IPSET_ATTR_PAD)) ||
-	       (skbinfo->skbprio &&
-		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
-			      cpu_to_be32(skbinfo->skbprio))) ||
-	       (skbinfo->skbqueue &&
-		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
-			     cpu_to_be16(skbinfo->skbqueue)));
-}
-
-static inline void
-ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
-		    const struct ip_set_ext *ext)
-{
-	*skbinfo = ext->skbinfo;
-}
-
 /* Netlink CB args */
 enum {
 	IPSET_CB_NET = 0,	/* net namespace */
@@ -539,6 +446,8 @@ bitmap_bytes(u32 a, u32 b)
 
 #include <linux/netfilter/ipset/ip_set_timeout.h>
 #include <linux/netfilter/ipset/ip_set_comment.h>
+#include <linux/netfilter/ipset/ip_set_counter.h>
+#include <linux/netfilter/ipset/ip_set_skbinfo.h>
 
 int
 ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h
new file mode 100644
index 000000000000..bb6fba480118
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_counter.h
@@ -0,0 +1,75 @@
+#ifndef _IP_SET_COUNTER_H
+#define _IP_SET_COUNTER_H
+
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef __KERNEL__
+
+static inline void
+ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)bytes, &(counter)->bytes);
+}
+
+static inline void
+ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)packets, &(counter)->packets);
+}
+
+static inline u64
+ip_set_get_bytes(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->bytes);
+}
+
+static inline u64
+ip_set_get_packets(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->packets);
+}
+
+static inline void
+ip_set_update_counter(struct ip_set_counter *counter,
+		      const struct ip_set_ext *ext,
+		      struct ip_set_ext *mext, u32 flags)
+{
+	if (ext->packets != ULLONG_MAX &&
+	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
+		ip_set_add_bytes(ext->bytes, counter);
+		ip_set_add_packets(ext->packets, counter);
+	}
+	if (flags & IPSET_FLAG_MATCH_COUNTERS) {
+		mext->packets = ip_set_get_packets(counter);
+		mext->bytes = ip_set_get_bytes(counter);
+	}
+}
+
+static inline bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+	return nla_put_net64(skb, IPSET_ATTR_BYTES,
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
+	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
+}
+
+static inline void
+ip_set_init_counter(struct ip_set_counter *counter,
+		    const struct ip_set_ext *ext)
+{
+	if (ext->bytes != ULLONG_MAX)
+		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
+	if (ext->packets != ULLONG_MAX)
+		atomic64_set(&(counter)->packets, (long long)(ext->packets));
+}
+
+#endif /* __KERNEL__ */
+#endif /* _IP_SET_COUNTER_H */
diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h
new file mode 100644
index 000000000000..29d7ef2bc3fa
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h
@@ -0,0 +1,46 @@
+#ifndef _IP_SET_SKBINFO_H
+#define _IP_SET_SKBINFO_H
+
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef __KERNEL__
+
+static inline void
+ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
+		   const struct ip_set_ext *ext,
+		   struct ip_set_ext *mext, u32 flags)
+{
+	mext->skbinfo = *skbinfo;
+}
+
+static inline bool
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
+{
+	/* Send nonzero parameters only */
+	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
+		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
+			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
+	       (skbinfo->skbprio &&
+		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+			      cpu_to_be32(skbinfo->skbprio))) ||
+	       (skbinfo->skbqueue &&
+		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+			      cpu_to_be16(skbinfo->skbqueue)));
+}
+
+static inline void
+ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
+		    const struct ip_set_ext *ext)
+{
+	*skbinfo = ext->skbinfo;
+}
+
+#endif /* __KERNEL__ */
+#endif /* _IP_SET_SKBINFO_H */
-- 
cgit v1.2.3


From 837a90eab67edfa464dcc0ddef193449d23da408 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 21:52:51 +0200
Subject: netfilter: ipset: Regroup ip_set_put_extensions and add extern

Cleanup: group ip_set_put_extensions and ip_set_get_extensions
together and add missing extern.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index b5bd0fb3d07b..7a218eb74887 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -331,6 +331,8 @@ extern size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[],
 			      size_t len, size_t align);
 extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 				 struct ip_set_ext *ext);
+extern int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
+				 const void *e, bool active);
 
 static inline int
 ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr)
@@ -449,10 +451,6 @@ bitmap_bytes(u32 a, u32 b)
 #include <linux/netfilter/ipset/ip_set_counter.h>
 #include <linux/netfilter/ipset/ip_set_skbinfo.h>
 
-int
-ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
-		      const void *e, bool active);
-
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
 	{ .bytes = (skb)->len, .packets = 1,		\
 	  .timeout = ip_set_adt_opt_timeout(opt, set) }
-- 
cgit v1.2.3


From 702b71e7c666a1c9be9d49e8cd173f0d4d1e859f Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 22:07:41 +0200
Subject: netfilter: ipset: Add element count to all set types header

It is better to list the number of set elements for all set types,
so that the header information is uniform. Element counts are
therefore added to the bitmap and list types.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h        |  2 ++
 include/linux/netfilter/ipset/ip_set_bitmap.h |  2 +-
 net/netfilter/ipset/ip_set_bitmap_gen.h       | 10 +++++++++-
 net/netfilter/ipset/ip_set_hash_gen.h         | 21 ++++++++++-----------
 net/netfilter/ipset/ip_set_list_set.c         |  6 +++++-
 5 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 7a218eb74887..4671d740610f 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -250,6 +250,8 @@ struct ip_set {
 	u8 flags;
 	/* Default timeout value, if enabled */
 	u32 timeout;
+	/* Number of elements (vs timeout) */
+	u32 elements;
 	/* Element data size */
 	size_t dsize;
 	/* Offsets to extensions in elements */
diff --git a/include/linux/netfilter/ipset/ip_set_bitmap.h b/include/linux/netfilter/ipset/ip_set_bitmap.h
index 5e4662a71e01..366d6c0ea04f 100644
--- a/include/linux/netfilter/ipset/ip_set_bitmap.h
+++ b/include/linux/netfilter/ipset/ip_set_bitmap.h
@@ -6,8 +6,8 @@
 #define IPSET_BITMAP_MAX_RANGE	0x0000FFFF
 
 enum {
+	IPSET_ADD_STORE_PLAIN_TIMEOUT = -1,
 	IPSET_ADD_FAILED = 1,
-	IPSET_ADD_STORE_PLAIN_TIMEOUT,
 	IPSET_ADD_START_STORED_TIMEOUT,
 };
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 4f07b90f8ef4..1810d1c06e3d 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -83,6 +83,7 @@ mtype_flush(struct ip_set *set)
 	if (set->extensions & IPSET_EXT_DESTROY)
 		mtype_ext_cleanup(set);
 	memset(map->members, 0, map->memsize);
+	set->elements = 0;
 }
 
 /* Calculate the actual memory size of the set data */
@@ -105,7 +106,8 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 		goto nla_put_failure;
 	if (mtype_do_head(skb, map) ||
 	    nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
-	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
 		goto nla_put_failure;
 	if (unlikely(ip_set_put_flags(skb, set)))
 		goto nla_put_failure;
@@ -149,6 +151,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (ret == IPSET_ADD_FAILED) {
 		if (SET_WITH_TIMEOUT(set) &&
 		    ip_set_timeout_expired(ext_timeout(x, set))) {
+			set->elements--;
 			ret = 0;
 		} else if (!(flags & IPSET_FLAG_EXIST)) {
 			set_bit(e->id, map->members);
@@ -157,6 +160,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		/* Element is re-added, cleanup extensions */
 		ip_set_ext_destroy(set, x);
 	}
+	if (ret > 0)
+		set->elements--;
 
 	if (SET_WITH_TIMEOUT(set))
 #ifdef IP_SET_BITMAP_STORED_TIMEOUT
@@ -174,6 +179,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 
 	/* Activate element */
 	set_bit(e->id, map->members);
+	set->elements++;
 
 	return 0;
 }
@@ -190,6 +196,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		return -IPSET_ERR_EXIST;
 
 	ip_set_ext_destroy(set, x);
+	set->elements--;
 	if (SET_WITH_TIMEOUT(set) &&
 	    ip_set_timeout_expired(ext_timeout(x, set)))
 		return -IPSET_ERR_EXIST;
@@ -285,6 +292,7 @@ mtype_gc(unsigned long ul_set)
 			if (ip_set_timeout_expired(ext_timeout(x, set))) {
 				clear_bit(id, map->members);
 				ip_set_ext_destroy(set, x);
+				set->elements--;
 			}
 		}
 	spin_unlock_bh(&set->lock);
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index f5acfb9709c9..6e967f198d1e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -275,7 +275,6 @@ htable_bits(u32 hashsize)
 struct htype {
 	struct htable __rcu *table; /* the hash table */
 	u32 maxelem;		/* max elements in the hash */
-	u32 elements;		/* current element (vs timeout) */
 	u32 initval;		/* random jhash init value */
 #ifdef IP_SET_HASH_WITH_MARKMASK
 	u32 markmask;		/* markmask value for mark mask to store */
@@ -400,7 +399,7 @@ mtype_flush(struct ip_set *set)
 #ifdef IP_SET_HASH_WITH_NETS
 	memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
 #endif
-	h->elements = 0;
+	set->elements = 0;
 }
 
 /* Destroy the hashtable part of the set */
@@ -506,7 +505,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
 						nets_length, k);
 #endif
 				ip_set_ext_destroy(set, data);
-				h->elements--;
+				set->elements--;
 				d++;
 			}
 		}
@@ -715,11 +714,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	bool deleted = false, forceadd = false, reuse = false;
 	u32 key, multi = 0;
 
-	if (h->elements >= h->maxelem) {
+	if (set->elements >= h->maxelem) {
 		if (SET_WITH_TIMEOUT(set))
 			/* FIXME: when set is full, we slow down here */
 			mtype_expire(set, h, NLEN(set->family), set->dsize);
-		if (h->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+		if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
 			forceadd = true;
 	}
 
@@ -732,7 +731,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				pr_warn("Set %s is full, maxelem %u reached\n",
 					set->name, h->maxelem);
 			return -IPSET_ERR_HASH_FULL;
-		} else if (h->elements >= h->maxelem) {
+		} else if (set->elements >= h->maxelem) {
 			goto set_full;
 		}
 		old = NULL;
@@ -781,11 +780,11 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 					NLEN(set->family), i);
 #endif
 			ip_set_ext_destroy(set, data);
-			h->elements--;
+			set->elements--;
 		}
 		goto copy_data;
 	}
-	if (h->elements >= h->maxelem)
+	if (set->elements >= h->maxelem)
 		goto set_full;
 	/* Create a new slot */
 	if (n->pos >= n->size) {
@@ -810,7 +809,7 @@ copy_elem:
 	j = n->pos++;
 	data = ahash_data(n, j, set->dsize);
 copy_data:
-	h->elements++;
+	set->elements++;
 #ifdef IP_SET_HASH_WITH_NETS
 	for (i = 0; i < IPSET_NET_COUNT; i++)
 		mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)),
@@ -883,7 +882,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		smp_mb__after_atomic();
 		if (i + 1 == n->pos)
 			n->pos--;
-		h->elements--;
+		set->elements--;
 #ifdef IP_SET_HASH_WITH_NETS
 		for (j = 0; j < IPSET_NET_COUNT; j++)
 			mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
@@ -1084,7 +1083,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 #endif
 	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
 	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
-	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(h->elements)))
+	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
 		goto nla_put_failure;
 	if (unlikely(ip_set_put_flags(skb, set)))
 		goto nla_put_failure;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 462b0b1870e2..c45516695934 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -166,6 +166,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
 static inline void
 list_set_del(struct ip_set *set, struct set_elem *e)
 {
+	set->elements--;
 	list_del_rcu(&e->list);
 	call_rcu(&e->rcu, __list_set_del_rcu);
 }
@@ -309,6 +310,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		list_add_rcu(&e->list, &prev->list);
 	else
 		list_add_tail_rcu(&e->list, &map->members);
+	set->elements++;
 
 	return 0;
 }
@@ -419,6 +421,7 @@ list_set_flush(struct ip_set *set)
 
 	list_for_each_entry_safe(e, n, &map->members, list)
 		list_set_del(set, e);
+	set->elements = 0;
 }
 
 static void
@@ -471,7 +474,8 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
 		goto nla_put_failure;
 	if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
 	    nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
-	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
+	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
 		goto nla_put_failure;
 	if (unlikely(ip_set_put_flags(skb, set)))
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 9e41f26a505cca04b7122e65053cf6447007ea79 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 12:05:34 +0100
Subject: netfilter: ipset: Count non-static extension memory for userspace

The non-static (i.e. comment) extension was not counted in the memory
size. A new internal counter is introduced for this. In the case of
the hash types, the sizes of the arrays are counted there as well, so
that we can avoid scanning the whole set when just the header data
is requested.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         |  8 ++++++--
 include/linux/netfilter/ipset/ip_set_comment.h |  7 +++++--
 net/netfilter/ipset/ip_set_bitmap_gen.h        |  5 +++--
 net/netfilter/ipset/ip_set_core.c              |  2 +-
 net/netfilter/ipset/ip_set_hash_gen.h          | 26 ++++++++++++++------------
 net/netfilter/ipset/ip_set_list_set.c          |  5 +++--
 6 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 4671d740610f..8e42253e5d4d 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -79,10 +79,12 @@ enum ip_set_ext_id {
 	IPSET_EXT_ID_MAX,
 };
 
+struct ip_set;
+
 /* Extension type */
 struct ip_set_ext_type {
 	/* Destroy extension private data (can be NULL) */
-	void (*destroy)(void *ext);
+	void (*destroy)(struct ip_set *set, void *ext);
 	enum ip_set_extension type;
 	enum ipset_cadt_flags flag;
 	/* Size and minimal alignment */
@@ -252,6 +254,8 @@ struct ip_set {
 	u32 timeout;
 	/* Number of elements (vs timeout) */
 	u32 elements;
+	/* Size of the dynamic extensions (vs timeout) */
+	size_t ext_size;
 	/* Element data size */
 	size_t dsize;
 	/* Offsets to extensions in elements */
@@ -268,7 +272,7 @@ ip_set_ext_destroy(struct ip_set *set, void *data)
 	 */
 	if (SET_WITH_COMMENT(set))
 		ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(
-			ext_comment(data, set));
+			set, ext_comment(data, set));
 }
 
 static inline int
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 5444b1bbe656..8e2bab1e8e90 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -20,13 +20,14 @@ ip_set_comment_uget(struct nlattr *tb)
  * The kadt functions don't use the comment extensions in any way.
  */
 static inline void
-ip_set_init_comment(struct ip_set_comment *comment,
+ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
 		    const struct ip_set_ext *ext)
 {
 	struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
 	size_t len = ext->comment ? strlen(ext->comment) : 0;
 
 	if (unlikely(c)) {
+		set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
 		kfree_rcu(c, rcu);
 		rcu_assign_pointer(comment->c, NULL);
 	}
@@ -38,6 +39,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 	if (unlikely(!c))
 		return;
 	strlcpy(c->str, ext->comment, len + 1);
+	set->ext_size += sizeof(*c) + strlen(c->str) + 1;
 	rcu_assign_pointer(comment->c, c);
 }
 
@@ -58,13 +60,14 @@ ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
  * of the set data anymore.
  */
 static inline void
-ip_set_comment_free(struct ip_set_comment *comment)
+ip_set_comment_free(struct ip_set *set, struct ip_set_comment *comment)
 {
 	struct ip_set_comment_rcu *c;
 
 	c = rcu_dereference_protected(comment->c, 1);
 	if (unlikely(!c))
 		return;
+	set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
 	kfree_rcu(c, rcu);
 	rcu_assign_pointer(comment->c, NULL);
 }
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 1810d1c06e3d..f8ea26cafa30 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -84,6 +84,7 @@ mtype_flush(struct ip_set *set)
 		mtype_ext_cleanup(set);
 	memset(map->members, 0, map->memsize);
 	set->elements = 0;
+	set->ext_size = 0;
 }
 
 /* Calculate the actual memory size of the set data */
@@ -99,7 +100,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 {
 	const struct mtype *map = set->data;
 	struct nlattr *nested;
-	size_t memsize = mtype_memsize(map, set->dsize);
+	size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size;
 
 	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 	if (!nested)
@@ -173,7 +174,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	if (SET_WITH_COUNTER(set))
 		ip_set_init_counter(ext_counter(x, set), ext);
 	if (SET_WITH_COMMENT(set))
-		ip_set_init_comment(ext_comment(x, set), ext);
+		ip_set_init_comment(set, ext_comment(x, set), ext);
 	if (SET_WITH_SKBINFO(set))
 		ip_set_init_skbinfo(ext_skbinfo(x, set), ext);
 
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index bfacccff7196..23345d2d136a 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -324,7 +324,7 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
 }
 EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
 
-typedef void (*destroyer)(void *);
+typedef void (*destroyer)(struct ip_set *, void *);
 /* ipset data extension types, in size order */
 
 const struct ip_set_ext_type ip_set_extensions[] = {
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 6e967f198d1e..0746405a1d14 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -343,21 +343,13 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length, u8 n)
 /* Calculate the actual memory size of the set data */
 static size_t
 mtype_ahash_memsize(const struct htype *h, const struct htable *t,
-		    u8 nets_length, size_t dsize)
+		    u8 nets_length)
 {
-	u32 i;
-	struct hbucket *n;
 	size_t memsize = sizeof(*h) + sizeof(*t);
 
 #ifdef IP_SET_HASH_WITH_NETS
 	memsize += sizeof(struct net_prefixes) * nets_length;
 #endif
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = rcu_dereference_bh(hbucket(t, i));
-		if (!n)
-			continue;
-		memsize += sizeof(struct hbucket) + n->size * dsize;
-	}
 
 	return memsize;
 }
@@ -400,6 +392,7 @@ mtype_flush(struct ip_set *set)
 	memset(h->nets, 0, sizeof(struct net_prefixes) * NLEN(set->family));
 #endif
 	set->elements = 0;
+	set->ext_size = 0;
 }
 
 /* Destroy the hashtable part of the set */
@@ -531,6 +524,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
 				d++;
 			}
 			tmp->pos = d;
+			set->ext_size -= AHASH_INIT_SIZE * dsize;
 			rcu_assign_pointer(hbucket(t, i), tmp);
 			kfree_rcu(n, rcu);
 		}
@@ -562,7 +556,7 @@ mtype_resize(struct ip_set *set, bool retried)
 	struct htype *h = set->data;
 	struct htable *t, *orig;
 	u8 htable_bits;
-	size_t dsize = set->dsize;
+	size_t extsize, dsize = set->dsize;
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 flags;
 	struct mtype_elem *tmp;
@@ -605,6 +599,7 @@ retry:
 	/* There can't be another parallel resizing, but dumping is possible */
 	atomic_set(&orig->ref, 1);
 	atomic_inc(&orig->uref);
+	extsize = 0;
 	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
 		 set->name, orig->htable_bits, htable_bits, orig);
 	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
@@ -635,6 +630,7 @@ retry:
 					goto cleanup;
 				}
 				m->size = AHASH_INIT_SIZE;
+				extsize = sizeof(*m) + AHASH_INIT_SIZE * dsize;
 				RCU_INIT_POINTER(hbucket(t, key), m);
 			} else if (m->pos >= m->size) {
 				struct hbucket *ht;
@@ -654,6 +650,7 @@ retry:
 				memcpy(ht, m, sizeof(struct hbucket) +
 					      m->size * dsize);
 				ht->size = m->size + AHASH_INIT_SIZE;
+				extsize += AHASH_INIT_SIZE * dsize;
 				kfree(m);
 				m = ht;
 				RCU_INIT_POINTER(hbucket(t, key), ht);
@@ -667,6 +664,7 @@ retry:
 		}
 	}
 	rcu_assign_pointer(h->table, t);
+	set->ext_size = extsize;
 
 	spin_unlock_bh(&set->lock);
 
@@ -740,6 +738,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (!n)
 			return -ENOMEM;
 		n->size = AHASH_INIT_SIZE;
+		set->ext_size += sizeof(*n) + AHASH_INIT_SIZE * set->dsize;
 		goto copy_elem;
 	}
 	for (i = 0; i < n->pos; i++) {
@@ -803,6 +802,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		memcpy(n, old, sizeof(struct hbucket) +
 		       old->size * set->dsize);
 		n->size = old->size + AHASH_INIT_SIZE;
+		set->ext_size += AHASH_INIT_SIZE * set->dsize;
 	}
 
 copy_elem:
@@ -823,7 +823,7 @@ overwrite_extensions:
 	if (SET_WITH_COUNTER(set))
 		ip_set_init_counter(ext_counter(data, set), ext);
 	if (SET_WITH_COMMENT(set))
-		ip_set_init_comment(ext_comment(data, set), ext);
+		ip_set_init_comment(set, ext_comment(data, set), ext);
 	if (SET_WITH_SKBINFO(set))
 		ip_set_init_skbinfo(ext_skbinfo(data, set), ext);
 	/* Must come last for the case when timed out entry is reused */
@@ -895,6 +895,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				k++;
 		}
 		if (n->pos == 0 && k == 0) {
+			set->ext_size -= sizeof(*n) + n->size * dsize;
 			rcu_assign_pointer(hbucket(t, key), NULL);
 			kfree_rcu(n, rcu);
 		} else if (k >= AHASH_INIT_SIZE) {
@@ -913,6 +914,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				k++;
 			}
 			tmp->pos = k;
+			set->ext_size -= AHASH_INIT_SIZE * dsize;
 			rcu_assign_pointer(hbucket(t, key), tmp);
 			kfree_rcu(n, rcu);
 		}
@@ -1061,7 +1063,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 
 	rcu_read_lock_bh();
 	t = rcu_dereference_bh_nfnl(h->table);
-	memsize = mtype_ahash_memsize(h, t, NLEN(set->family), set->dsize);
+	memsize = mtype_ahash_memsize(h, t, NLEN(set->family)) + set->ext_size;
 	htable_bits = t->htable_bits;
 	rcu_read_unlock_bh();
 
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index c45516695934..dede343a662b 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -228,7 +228,7 @@ list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext,
 	if (SET_WITH_COUNTER(set))
 		ip_set_init_counter(ext_counter(e, set), ext);
 	if (SET_WITH_COMMENT(set))
-		ip_set_init_comment(ext_comment(e, set), ext);
+		ip_set_init_comment(set, ext_comment(e, set), ext);
 	if (SET_WITH_SKBINFO(set))
 		ip_set_init_skbinfo(ext_skbinfo(e, set), ext);
 	/* Update timeout last */
@@ -422,6 +422,7 @@ list_set_flush(struct ip_set *set)
 	list_for_each_entry_safe(e, n, &map->members, list)
 		list_set_del(set, e);
 	set->elements = 0;
+	set->ext_size = 0;
 }
 
 static void
@@ -467,7 +468,7 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
 {
 	const struct list_set *map = set->data;
 	struct nlattr *nested;
-	size_t memsize = list_set_memsize(map, set->dsize);
+	size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size;
 
 	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
 	if (!nested)
-- 
cgit v1.2.3


From c540594f864bb4645573c2c0a304919fabb3d7ea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Nov 2016 22:02:34 +0100
Subject: bpf, mlx4: fix prog refcount in mlx4_en_try_alloc_resources error
 path

Commit 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings
scheme") added a bug in that the prog's reference count is not dropped
in the error path when mlx4_en_try_alloc_resources() is failing from
mlx4_xdp_set().

We previously took bpf_prog_add(prog, priv->rx_ring_num - 1), that we
need to release again. Earlier in the call path, dev_change_xdp_fd()
itself holds a reference to the prog as well (hence the '- 1' in the
bpf_prog_add()), so a simple atomic_sub() is safe to use here. When
an error is propagated, then bpf_prog_put() is called eventually from
dev_change_xdp_fd().

Fixes: 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  5 ++++-
 include/linux/bpf.h                            |  5 +++++
 kernel/bpf/syscall.c                           | 11 +++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 0f6225c042be..9bf7320107b0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2747,8 +2747,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	}
 
 	err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
-	if (err)
+	if (err) {
+		if (prog)
+			bpf_prog_sub(prog, priv->rx_ring_num - 1);
 		goto unlock_out;
+	}
 
 	if (priv->port_up) {
 		port_up = 1;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index edcd96ded8aa..01c1487277b2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -234,6 +234,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
+void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
@@ -303,6 +304,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+}
+
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962447a5..23eb2050f15e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -680,6 +680,17 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_add);
 
+void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+	/* Only to be used for undoing previous bpf_prog_add() in some
+	 * error path. We still know that another entity in our call
+	 * path holds a reference to the program, thus atomic_sub() can
+	 * be safely used in such cases!
+	 */
+	WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_sub);
+
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 {
 	return bpf_prog_add(prog, 1);
-- 
cgit v1.2.3


From 372788f964c95a6fa0f677c43d6153c27896ef42 Mon Sep 17 00:00:00 2001
From: "Lendacky, Thomas" <Thomas.Lendacky@amd.com>
Date: Thu, 10 Nov 2016 17:10:46 -0600
Subject: net: phy: expose phy_aneg_done API for use by drivers

Make phy_aneg_done() available to drivers so that the result of the
auto-negotiation initiated by phy_start_aneg() can be determined.

Remove the local implementation of phy_aneg_done() from the Aeroflex
driver and use the phy library version.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aeroflex/greth.c | 9 ---------
 drivers/net/phy/phy.c                 | 3 ++-
 include/linux/phy.h                   | 1 +
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c
index f8df8248035e..93def92f9997 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -1290,15 +1290,6 @@ static int greth_mdio_probe(struct net_device *dev)
 	return 0;
 }
 
-static inline int phy_aneg_done(struct phy_device *phydev)
-{
-	int retval;
-
-	retval = phy_read(phydev, MII_BMSR);
-
-	return (retval < 0) ? retval : (retval & BMSR_ANEGCOMPLETE);
-}
-
 static int greth_mdio_init(struct greth_private *greth)
 {
 	int ret;
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 2f94c60d4939..e6dd222fddb1 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -143,13 +143,14 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
  * Returns > 0 on success or < 0 on error. 0 means that auto-negotiation
  * is still pending.
  */
-static inline int phy_aneg_done(struct phy_device *phydev)
+int phy_aneg_done(struct phy_device *phydev)
 {
 	if (phydev->drv->aneg_done)
 		return phydev->drv->aneg_done(phydev);
 
 	return genphy_aneg_done(phydev);
 }
+EXPORT_SYMBOL(phy_aneg_done);
 
 /* A structure for mapping a particular speed and duplex
  * combination to a particular SUPPORTED and ADVERTISED value
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e7e1fd382564..9880d73a2c3d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -786,6 +786,7 @@ void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
+int phy_aneg_done(struct phy_device *phydev);
 
 int phy_stop_interrupts(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From 4a4f86cc7d6bc74522f581341a2cae3119d5a0f5 Mon Sep 17 00:00:00 2001
From: pravin shelar <pshelar@ovn.org>
Date: Sun, 13 Nov 2016 20:43:52 -0800
Subject: vxlan: avoid vlan processing in vxlan device.

VxLan device does not have special handling for vlan tagging on egress.
Therefore it does not make sense to expose vlan offloading feature.
This patch does not change vxlan functionality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c     |  9 +--------
 include/linux/if_vlan.h | 16 ----------------
 2 files changed, 1 insertion(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 5264c1a49d86..7bebce190270 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1750,18 +1750,13 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
 	}
 
 	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
-			+ VXLAN_HLEN + iphdr_len
-			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+			+ VXLAN_HLEN + iphdr_len;
 
 	/* Need space for new headers (invalidates iph ptr) */
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
 		goto out_free;
 
-	skb = vlan_hwaccel_push_inside(skb);
-	if (WARN_ON(!skb))
-		return -ENOMEM;
-
 	err = iptunnel_handle_offloads(skb, type);
 	if (err)
 		goto out_free;
@@ -2529,10 +2524,8 @@ static void vxlan_setup(struct net_device *dev)
 	dev->features   |= NETIF_F_GSO_SOFTWARE;
 
 	dev->vlan_features = dev->features;
-	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
 	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
 	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
-	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
 	netif_keep_dst(dev);
 	dev->priv_flags |= IFF_NO_QUEUE;
 
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3319d97d789d..8d5fcd6284ce 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -399,22 +399,6 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
 		skb->vlan_tci = 0;
 	return skb;
 }
-/*
- * vlan_hwaccel_push_inside - pushes vlan tag to the payload
- * @skb: skbuff to tag
- *
- * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the
- * VLAN tag from @skb->vlan_tci inside to the payload.
- *
- * Following the skb_unshare() example, in case of error, the calling function
- * doesn't have to worry about freeing the original skb.
- */
-static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb)
-{
-	if (skb_vlan_tag_present(skb))
-		skb = __vlan_hwaccel_push_inside(skb);
-	return skb;
-}
 
 /**
  * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
-- 
cgit v1.2.3


From e86a8987e458a1826f509c41494b0b29a61144a7 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 Nov 2016 10:06:30 -0800
Subject: net: phy: Add phy_ethtool_nway_reset

This function just calls into genphy_restart_aneg() to perform an
autonegotiation restart.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 11 +++++++++++
 include/linux/phy.h   |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e6dd222fddb1..73adbaa9ac86 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1441,3 +1441,14 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 	return phy_ethtool_ksettings_set(phydev, cmd);
 }
 EXPORT_SYMBOL(phy_ethtool_set_link_ksettings);
+
+int phy_ethtool_nway_reset(struct net_device *ndev)
+{
+	struct phy_device *phydev = ndev->phydev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return genphy_restart_aneg(phydev);
+}
+EXPORT_SYMBOL(phy_ethtool_nway_reset);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9880d73a2c3d..b9bd3b4f4ea1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -860,6 +860,7 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev,
 				   struct ethtool_link_ksettings *cmd);
 int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
+int phy_ethtool_nway_reset(struct net_device *ndev);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-- 
cgit v1.2.3


From ff86aae3b4112b85d2231c23bccbc49589df1c06 Mon Sep 17 00:00:00 2001
From: Madalin Bucur <madalin.bucur@nxp.com>
Date: Tue, 15 Nov 2016 10:41:01 +0200
Subject: devres: add devm_alloc_percpu()

Introduce managed counterparts for alloc_percpu() and free_percpu().
Add devm_alloc_percpu() and devm_free_percpu() into the managed
interfaces list.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/driver-model/devres.txt |  4 +++
 drivers/base/devres.c                 | 66 +++++++++++++++++++++++++++++++++++
 include/linux/device.h                | 19 ++++++++++
 3 files changed, 89 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 167070895498..ca9d1eb46bc0 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -332,6 +332,10 @@ MEM
 MFD
  devm_mfd_add_devices()
 
+PER-CPU MEM
+  devm_alloc_percpu()
+  devm_free_percpu()
+
 PCI
   pcim_enable_device()	: after success, all PCI ops become managed
   pcim_pin_device()	: keep PCI device enabled after release
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 8fc654f0807b..71d577025285 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -10,6 +10,7 @@
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/percpu.h>
 
 #include "base.h"
 
@@ -985,3 +986,68 @@ void devm_free_pages(struct device *dev, unsigned long addr)
 			       &devres));
 }
 EXPORT_SYMBOL_GPL(devm_free_pages);
+
+static void devm_percpu_release(struct device *dev, void *pdata)
+{
+	void __percpu *p;
+
+	p = *(void __percpu **)pdata;
+	free_percpu(p);
+}
+
+static int devm_percpu_match(struct device *dev, void *data, void *p)
+{
+	struct devres *devr = container_of(data, struct devres, data);
+
+	return *(void **)devr->data == p;
+}
+
+/**
+ * __devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @size: Size of per-cpu memory to allocate
+ * @align: Alignment of per-cpu memory to allocate
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
+		size_t align)
+{
+	void *p;
+	void __percpu *pcpu;
+
+	pcpu = __alloc_percpu(size, align);
+	if (!pcpu)
+		return NULL;
+
+	p = devres_alloc(devm_percpu_release, sizeof(void *), GFP_KERNEL);
+	if (!p) {
+		free_percpu(pcpu);
+		return NULL;
+	}
+
+	*(void __percpu **)p = pcpu;
+
+	devres_add(dev, p);
+
+	return pcpu;
+}
+EXPORT_SYMBOL_GPL(__devm_alloc_percpu);
+
+/**
+ * devm_free_percpu - Resource-managed free_percpu
+ * @dev: Device this memory belongs to
+ * @pdata: Per-cpu memory to free
+ *
+ * Free memory allocated with devm_alloc_percpu().
+ */
+void devm_free_percpu(struct device *dev, void __percpu *pdata)
+{
+	WARN_ON(devres_destroy(dev, devm_percpu_release, devm_percpu_match,
+			       (void *)pdata));
+}
+EXPORT_SYMBOL_GPL(devm_free_percpu);
diff --git a/include/linux/device.h b/include/linux/device.h
index bc41e87a969b..a00105cf795e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -698,6 +698,25 @@ static inline int devm_add_action_or_reset(struct device *dev,
 	return ret;
 }
 
+/**
+ * devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @type: Type to allocate per-cpu memory for
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+#define devm_alloc_percpu(dev, type)      \
+	((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
+						      __alignof__(type)))
+
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
+				   size_t align);
+void devm_free_percpu(struct device *dev, void __percpu *pdata);
+
 struct device_dma_parameters {
 	/*
 	 * a low level driver may set these to teach IOMMU code about
-- 
cgit v1.2.3


From 217f6974368188fd8bd7804bf5a036aa5762c5e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:11 -0800
Subject: net: busy-poll: allow preemption in sk_busy_loop()

After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"),
sk_busy_loop() needs a bit of care :
softirqs might be delayed since we do not allow preemption yet.

This patch adds preemption points in sk_busy_loop(),
and makes sure no unnecessary cache line dirtying
or atomic operations are done while looping.

A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL

This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED,
so that sk_busy_loop() does not have to grab it again.

Similarly, netpoll_poll_lock() is done one time.

This gives about 10 to 20 % improvement in various busy polling
tests, especially when many threads are busy polling in
configurations with large number of NIC queues.

This should allow experimenting with bigger delays without
hurting overall latencies.

Tested:
 On a 40Gb mlx4 NIC, 32 RX/TX queues.

 echo 70 >/proc/sys/net/core/busy_read
 for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done

    Before:      After:
 1:   90072   92819
 2:  157289  184007
 3:  235772  213504
 4:  344074  357513
 5:  394755  458267
 6:  461151  487819
 7:  549116  625963
 8:  544423  716219
 9:  720460  738446
10:  794686  837612
11:  915998  923960
12:  937507  925107
13: 1019677  971506
14: 1046831 1113650
15: 1114154 1148902
16: 1105221 1179263
17: 1266552 1299585
18: 1258454 1383817
19: 1341453 1312194
20: 1363557 1488487
21: 1387979 1501004
22: 1417552 1601683
23: 1550049 1642002
24: 1568876 1601915
25: 1560239 1683607
26: 1640207 1745211
27: 1706540 1723574
28: 1638518 1722036
29: 1734309 1757447
30: 1782007 1855436
31: 1724806 1888539
32: 1717716 1944297
33: 1778716 1869118
34: 1805738 1983466
35: 1815694 2020758
36: 1893059 2035632
37: 1843406 2034653
38: 1888830 2086580
39: 1972827 2143567
40: 1877729 2181851

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  10 +++++
 net/core/dev.c            | 102 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 92 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 86bacf6a64f0..e71de66e3792 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -334,6 +334,16 @@ enum {
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+};
+
+enum {
+	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
+	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6deba68ad9e4..369dcc8efc01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4902,6 +4902,12 @@ void __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
+	/* Some drivers call us directly, instead of calling
+	 * napi_complete_done().
+	 */
+	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
+		return;
+
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
@@ -4913,10 +4919,13 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 	unsigned long flags;
 
 	/*
-	 * don't let napi dequeue from the cpu poll list
-	 * just in case its running on a different cpu
+	 * 1) Don't let napi dequeue from the cpu poll list
+	 *    just in case its running on a different cpu.
+	 * 2) If we are busy polling, do nothing here, we have
+	 *    the guarantee we will be called later.
 	 */
-	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
+	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
+				 NAPIF_STATE_IN_BUSY_POLL)))
 		return;
 
 	if (n->gro_list) {
@@ -4956,13 +4965,41 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
+
 #define BUSY_POLL_BUDGET 8
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+{
+	int rc;
+
+	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
+
+	local_bh_disable();
+
+	/* All we really want here is to re-enable device interrupts.
+	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
+	 */
+	rc = napi->poll(napi, BUSY_POLL_BUDGET);
+	netpoll_poll_unlock(have_poll_lock);
+	if (rc == BUSY_POLL_BUDGET)
+		__napi_schedule(napi);
+	local_bh_enable();
+	if (local_softirq_pending())
+		do_softirq();
+}
+
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	int (*napi_poll)(struct napi_struct *napi, int budget);
 	int (*busy_poll)(struct napi_struct *dev);
+	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	int rc = false;
+	int rc;
+
+restart:
+	rc = false;
+	napi_poll = NULL;
 
 	rcu_read_lock();
 
@@ -4973,24 +5010,33 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
-	do {
+	preempt_disable();
+	for (;;) {
 		rc = 0;
 		local_bh_disable();
 		if (busy_poll) {
 			rc = busy_poll(napi);
-		} else if (napi_schedule_prep(napi)) {
-			void *have = netpoll_poll_lock(napi);
-
-			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
-				rc = napi->poll(napi, BUSY_POLL_BUDGET);
-				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
-				if (rc == BUSY_POLL_BUDGET) {
-					napi_complete_done(napi, rc);
-					napi_schedule(napi);
-				}
-			}
-			netpoll_poll_unlock(have);
+			goto count;
 		}
+		if (!napi_poll) {
+			unsigned long val = READ_ONCE(napi->state);
+
+			/* If multiple threads are competing for this napi,
+			 * we avoid dirtying napi->state as much as we can.
+			 */
+			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
+				   NAPIF_STATE_IN_BUSY_POLL))
+				goto count;
+			if (cmpxchg(&napi->state, val,
+				    val | NAPIF_STATE_IN_BUSY_POLL |
+					  NAPIF_STATE_SCHED) != val)
+				goto count;
+			have_poll_lock = netpoll_poll_lock(napi);
+			napi_poll = napi->poll;
+		}
+		rc = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+count:
 		if (rc > 0)
 			__NET_ADD_STATS(sock_net(sk),
 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
@@ -4999,10 +5045,26 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
 
-		cpu_relax();
-	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
-		 !need_resched() && !busy_loop_timeout(end_time));
+		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
+		    busy_loop_timeout(end_time))
+			break;
 
+		if (unlikely(need_resched())) {
+			if (napi_poll)
+				busy_poll_stop(napi, have_poll_lock);
+			preempt_enable();
+			rcu_read_unlock();
+			cond_resched();
+			rc = !skb_queue_empty(&sk->sk_receive_queue);
+			if (rc || busy_loop_timeout(end_time))
+				return rc;
+			goto restart;
+		}
+		cpu_relax_lowlatency();
+	}
+	if (napi_poll)
+		busy_poll_stop(napi, have_poll_lock);
+	preempt_enable();
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 364b6055738b4c752c30ccaaf25c624e69d76195 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:13 -0800
Subject: net: busy-poll: return busypolling status to drivers

NAPI drivers use napi_complete_done() or napi_complete() when
they drained RX ring and right before re-enabling device interrupts.

In busy polling, we can avoid interrupts being delivered since
we are polling RX ring in a controlled loop.

Drivers can choose to use the napi_complete_done() return value
to reduce interrupt overhead while busy polling is active.

This is optional, legacy drivers should work fine even
if not updated.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  7 ++++---
 net/core/dev.c            | 10 ++++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e71de66e3792..bcddf951ccee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -463,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-void __napi_complete(struct napi_struct *n);
-void napi_complete_done(struct napi_struct *n, int work_done);
+bool __napi_complete(struct napi_struct *n);
+bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
  *	@n: NAPI context
  *
  * Mark NAPI processing as complete.
  * Consider using napi_complete_done() instead.
+ * Return false if device should avoid rearming interrupts.
  */
-static inline void napi_complete(struct napi_struct *n)
+static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 369dcc8efc01..edba9efeb2e9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4898,7 +4898,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
 }
 EXPORT_SYMBOL(__napi_schedule_irqoff);
 
-void __napi_complete(struct napi_struct *n)
+bool __napi_complete(struct napi_struct *n)
 {
 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
 
@@ -4906,15 +4906,16 @@ void __napi_complete(struct napi_struct *n)
 	 * napi_complete_done().
 	 */
 	if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
-		return;
+		return false;
 
 	list_del_init(&n->poll_list);
 	smp_mb__before_atomic();
 	clear_bit(NAPI_STATE_SCHED, &n->state);
+	return true;
 }
 EXPORT_SYMBOL(__napi_complete);
 
-void napi_complete_done(struct napi_struct *n, int work_done)
+bool napi_complete_done(struct napi_struct *n, int work_done)
 {
 	unsigned long flags;
 
@@ -4926,7 +4927,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 	 */
 	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
 				 NAPIF_STATE_IN_BUSY_POLL)))
-		return;
+		return false;
 
 	if (n->gro_list) {
 		unsigned long timeout = 0;
@@ -4948,6 +4949,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 		__napi_complete(n);
 		local_irq_restore(flags);
 	}
+	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
 
-- 
cgit v1.2.3


From 89c4b442b78bdba388337cc746fe63caba85f46c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Nov 2016 14:54:50 -0800
Subject: netpoll: more efficient locking

Callers of netpoll_poll_lock() own NAPI_STATE_SCHED

Callers of netpoll_poll_unlock() have BH blocked between
the NAPI_STATE_SCHED being cleared and poll_lock is released.

We can avoid the spinlock which has no contention, and use cmpxchg()
on poll_owner which we need to set anyway.

This removes a possible lockdep violation after the cited commit,
since sk_busy_loop() re-enables BH before calling busy_poll_stop()

Fixes: 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 include/linux/netpoll.h   | 13 +++++++------
 net/core/dev.c            |  1 -
 net/core/netpoll.c        |  6 +++---
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcddf951ccee..e84800edd249 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -316,7 +316,6 @@ struct napi_struct {
 	unsigned int		gro_count;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
-	spinlock_t		poll_lock;
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index b25ee9ffdbe6..1828900c9411 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -78,8 +78,11 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
 	struct net_device *dev = napi->dev;
 
 	if (dev && dev->npinfo) {
-		spin_lock(&napi->poll_lock);
-		napi->poll_owner = smp_processor_id();
+		int owner = smp_processor_id();
+
+		while (cmpxchg(&napi->poll_owner, -1, owner) != -1)
+			cpu_relax();
+
 		return napi;
 	}
 	return NULL;
@@ -89,10 +92,8 @@ static inline void netpoll_poll_unlock(void *have)
 {
 	struct napi_struct *napi = have;
 
-	if (napi) {
-		napi->poll_owner = -1;
-		spin_unlock(&napi->poll_lock);
-	}
+	if (napi)
+		smp_store_release(&napi->poll_owner, -1);
 }
 
 static inline bool netpoll_tx_running(struct net_device *dev)
diff --git a/net/core/dev.c b/net/core/dev.c
index edba9efeb2e9..f71b34ab57a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5143,7 +5143,6 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	list_add(&napi->dev_list, &dev->napi_list);
 	napi->dev = dev;
 #ifdef CONFIG_NETPOLL
-	spin_lock_init(&napi->poll_lock);
 	napi->poll_owner = -1;
 #endif
 	set_bit(NAPI_STATE_SCHED, &napi->state);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 53599bd0c82d..9424673009c1 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -171,12 +171,12 @@ static void poll_one_napi(struct napi_struct *napi)
 static void poll_napi(struct net_device *dev)
 {
 	struct napi_struct *napi;
+	int cpu = smp_processor_id();
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner != smp_processor_id() &&
-		    spin_trylock(&napi->poll_lock)) {
+		if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
 			poll_one_napi(napi);
-			spin_unlock(&napi->poll_lock);
+			smp_store_release(&napi->poll_owner, -1);
 		}
 	}
 }
-- 
cgit v1.2.3


From 0ac3ea70897fb9f84b620aeda074ecccf481629d Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:55 +0200
Subject: net/mlx5: Make the command interface cache more flexible

Add more cache command size sets and more entries for each set, based on
the different sizes and frequencies of the current command set.

Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 145 ++++++++++++--------------
 include/linux/mlx5/driver.h                   |  14 +--
 2 files changed, 76 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 8561102f2563..0fe7a60bf66a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -53,14 +53,6 @@ enum {
 	CMD_MODE_EVENTS
 };
 
-enum {
-	NUM_LONG_LISTS	  = 2,
-	NUM_MED_LISTS	  = 64,
-	LONG_LIST_SIZE	  = (2ULL * 1024 * 1024 * 1024 / PAGE_SIZE) * 8 + 16 +
-				MLX5_CMD_DATA_BLOCK_SIZE,
-	MED_LIST_SIZE	  = 16 + MLX5_CMD_DATA_BLOCK_SIZE,
-};
-
 enum {
 	MLX5_CMD_DELIVERY_STAT_OK			= 0x0,
 	MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR		= 0x1,
@@ -1372,10 +1364,10 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
 {
 	unsigned long flags;
 
-	if (msg->cache) {
-		spin_lock_irqsave(&msg->cache->lock, flags);
-		list_add_tail(&msg->list, &msg->cache->head);
-		spin_unlock_irqrestore(&msg->cache->lock, flags);
+	if (msg->parent) {
+		spin_lock_irqsave(&msg->parent->lock, flags);
+		list_add_tail(&msg->list, &msg->parent->head);
+		spin_unlock_irqrestore(&msg->parent->lock, flags);
 	} else {
 		mlx5_free_cmd_msg(dev, msg);
 	}
@@ -1472,30 +1464,37 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
 				      gfp_t gfp)
 {
 	struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
+	struct cmd_msg_cache *ch = NULL;
 	struct mlx5_cmd *cmd = &dev->cmd;
-	struct cache_ent *ent = NULL;
-
-	if (in_size > MED_LIST_SIZE && in_size <= LONG_LIST_SIZE)
-		ent = &cmd->cache.large;
-	else if (in_size > 16 && in_size <= MED_LIST_SIZE)
-		ent = &cmd->cache.med;
-
-	if (ent) {
-		spin_lock_irq(&ent->lock);
-		if (!list_empty(&ent->head)) {
-			msg = list_entry(ent->head.next, typeof(*msg), list);
-			/* For cached lists, we must explicitly state what is
-			 * the real size
-			 */
-			msg->len = in_size;
-			list_del(&msg->list);
+	int i;
+
+	if (in_size <= 16)
+		goto cache_miss;
+
+	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+		ch = &cmd->cache[i];
+		if (in_size > ch->max_inbox_size)
+			continue;
+		spin_lock_irq(&ch->lock);
+		if (list_empty(&ch->head)) {
+			spin_unlock_irq(&ch->lock);
+			continue;
 		}
-		spin_unlock_irq(&ent->lock);
+		msg = list_entry(ch->head.next, typeof(*msg), list);
+		/* For cached lists, we must explicitly state what is
+		 * the real size
+		 */
+		msg->len = in_size;
+		list_del(&msg->list);
+		spin_unlock_irq(&ch->lock);
+		break;
 	}
 
-	if (IS_ERR(msg))
-		msg = mlx5_alloc_cmd_msg(dev, gfp, in_size, 0);
+	if (!IS_ERR(msg))
+		return msg;
 
+cache_miss:
+	msg = mlx5_alloc_cmd_msg(dev, gfp, in_size, 0);
 	return msg;
 }
 
@@ -1593,58 +1592,56 @@ EXPORT_SYMBOL(mlx5_cmd_exec_cb);
 
 static void destroy_msg_cache(struct mlx5_core_dev *dev)
 {
-	struct mlx5_cmd *cmd = &dev->cmd;
+	struct cmd_msg_cache *ch;
 	struct mlx5_cmd_msg *msg;
 	struct mlx5_cmd_msg *n;
+	int i;
 
-	list_for_each_entry_safe(msg, n, &cmd->cache.large.head, list) {
-		list_del(&msg->list);
-		mlx5_free_cmd_msg(dev, msg);
-	}
-
-	list_for_each_entry_safe(msg, n, &cmd->cache.med.head, list) {
-		list_del(&msg->list);
-		mlx5_free_cmd_msg(dev, msg);
+	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+		ch = &dev->cmd.cache[i];
+		list_for_each_entry_safe(msg, n, &ch->head, list) {
+			list_del(&msg->list);
+			mlx5_free_cmd_msg(dev, msg);
+		}
 	}
 }
 
-static int create_msg_cache(struct mlx5_core_dev *dev)
+static unsigned cmd_cache_num_ent[MLX5_NUM_COMMAND_CACHES] = {
+	512, 32, 16, 8, 2
+};
+
+static unsigned cmd_cache_ent_size[MLX5_NUM_COMMAND_CACHES] = {
+	16 + MLX5_CMD_DATA_BLOCK_SIZE,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 2,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 16,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 256,
+	16 + MLX5_CMD_DATA_BLOCK_SIZE * 512,
+};
+
+static void create_msg_cache(struct mlx5_core_dev *dev)
 {
 	struct mlx5_cmd *cmd = &dev->cmd;
+	struct cmd_msg_cache *ch;
 	struct mlx5_cmd_msg *msg;
-	int err;
 	int i;
-
-	spin_lock_init(&cmd->cache.large.lock);
-	INIT_LIST_HEAD(&cmd->cache.large.head);
-	spin_lock_init(&cmd->cache.med.lock);
-	INIT_LIST_HEAD(&cmd->cache.med.head);
-
-	for (i = 0; i < NUM_LONG_LISTS; i++) {
-		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, LONG_LIST_SIZE, 0);
-		if (IS_ERR(msg)) {
-			err = PTR_ERR(msg);
-			goto ex_err;
-		}
-		msg->cache = &cmd->cache.large;
-		list_add_tail(&msg->list, &cmd->cache.large.head);
-	}
-
-	for (i = 0; i < NUM_MED_LISTS; i++) {
-		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, MED_LIST_SIZE, 0);
-		if (IS_ERR(msg)) {
-			err = PTR_ERR(msg);
-			goto ex_err;
+	int k;
+
+	/* Initialize and fill the caches with initial entries */
+	for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) {
+		ch = &cmd->cache[k];
+		spin_lock_init(&ch->lock);
+		INIT_LIST_HEAD(&ch->head);
+		ch->num_ent = cmd_cache_num_ent[k];
+		ch->max_inbox_size = cmd_cache_ent_size[k];
+		for (i = 0; i < ch->num_ent; i++) {
+			msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL | __GFP_NOWARN,
+						 ch->max_inbox_size, 0);
+			if (IS_ERR(msg))
+				break;
+			msg->parent = ch;
+			list_add_tail(&msg->list, &ch->head);
 		}
-		msg->cache = &cmd->cache.med;
-		list_add_tail(&msg->list, &cmd->cache.med.head);
 	}
-
-	return 0;
-
-ex_err:
-	destroy_msg_cache(dev);
-	return err;
 }
 
 static int alloc_cmd_page(struct mlx5_core_dev *dev, struct mlx5_cmd *cmd)
@@ -1767,11 +1764,7 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev)
 
 	cmd->mode = CMD_MODE_POLLING;
 
-	err = create_msg_cache(dev);
-	if (err) {
-		dev_err(&dev->pdev->dev, "failed to create command cache\n");
-		goto err_free_page;
-	}
+	create_msg_cache(dev);
 
 	set_wqname(dev);
 	cmd->wq = create_singlethread_workqueue(cmd->wq_name);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ecc451d89ccd..5e7dbbcf47f0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -208,7 +208,7 @@ struct mlx5_cmd_first {
 
 struct mlx5_cmd_msg {
 	struct list_head		list;
-	struct cache_ent	       *cache;
+	struct cmd_msg_cache	       *parent;
 	u32				len;
 	struct mlx5_cmd_first		first;
 	struct mlx5_cmd_mailbox	       *next;
@@ -228,17 +228,17 @@ struct mlx5_cmd_debug {
 	u16			outlen;
 };
 
-struct cache_ent {
+struct cmd_msg_cache {
 	/* protect block chain allocations
 	 */
 	spinlock_t		lock;
 	struct list_head	head;
+	unsigned int		max_inbox_size;
+	unsigned int		num_ent;
 };
 
-struct cmd_msg_cache {
-	struct cache_ent	large;
-	struct cache_ent	med;
-
+enum {
+	MLX5_NUM_COMMAND_CACHES = 5,
 };
 
 struct mlx5_cmd_stats {
@@ -281,7 +281,7 @@ struct mlx5_cmd {
 	struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS];
 	struct pci_pool *pool;
 	struct mlx5_cmd_debug dbg;
-	struct cmd_msg_cache cache;
+	struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES];
 	int checksum_disabled;
 	struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX];
 };
-- 
cgit v1.2.3


From 4ce3bf2fa8ba309b5ca19539fcc8671a0fc084f9 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:56 +0200
Subject: net/mlx5: Port module event hardware structures

Add hardware structures and constants definitions needed for module
events support.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h   | 11 +++++++++++
 include/linux/mlx5/mlx5_ifc.h |  3 ++-
 include/linux/mlx5/port.h     |  3 +++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 58276144ba81..52b437431c6a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -277,6 +277,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_INTERNAL_ERROR	   = 0x08,
 	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
 	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
+	MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
 	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
 
 	MLX5_EVENT_TYPE_DB_BF_CONGESTION   = 0x1a,
@@ -552,6 +553,15 @@ struct mlx5_eqe_vport_change {
 	__be32		rsvd1[6];
 } __packed;
 
+struct mlx5_eqe_port_module {
+	u8        reserved_at_0[1];
+	u8        module;
+	u8        reserved_at_2[1];
+	u8        module_status;
+	u8        reserved_at_4[2];
+	u8        error_type;
+} __packed;
+
 union ev_data {
 	__be32				raw[7];
 	struct mlx5_eqe_cmd		cmd;
@@ -565,6 +575,7 @@ union ev_data {
 	struct mlx5_eqe_page_req	req_pages;
 	struct mlx5_eqe_page_fault	page_fault;
 	struct mlx5_eqe_vport_change	vport_change;
+	struct mlx5_eqe_port_module	port_module;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2632cb2caf10..cd1d530ca368 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -824,7 +824,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   early_vf_enable[0x1];
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_at_1af[0x2];
+	u8         port_module_event[0x1];
+	u8         reserved_at_1b0[0x1];
 	u8         ports_check[0x1];
 	u8         reserved_at_1b2[0x1];
 	u8         disable_link_up[0x1];
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index b3065acd20b4..dde8c7ec5ff1 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -94,6 +94,9 @@ enum mlx5e_link_mode {
 
 #define MLX5E_PROT_MASK(link_mode) (1 << link_mode)
 
+#define PORT_MODULE_EVENT_MODULE_STATUS_MASK 0xF
+#define PORT_MODULE_EVENT_ERROR_TYPE_MASK         0xF
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
-- 
cgit v1.2.3


From d4eb4cd78b0774c7061db56844ed2ea7790cc77c Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:57 +0200
Subject: net/mlx5: Add handling for port module event

For each asynchronous port module event:
  1. print with ratelimit to the dmesg log
  2. increment the corresponding event counter

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       | 12 +++++
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 57 ++++++++++++++++++++++
 include/linux/mlx5/driver.h                        | 27 ++++++++++
 4 files changed, 97 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index e74a73be5e0a..8ffcc8808e50 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -139,6 +139,8 @@ static const char *eqe_type_str(u8 type)
 		return "MLX5_EVENT_TYPE_PORT_CHANGE";
 	case MLX5_EVENT_TYPE_GPIO_EVENT:
 		return "MLX5_EVENT_TYPE_GPIO_EVENT";
+	case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
+		return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT";
 	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
 		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
 	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
@@ -285,6 +287,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
 			mlx5_eswitch_vport_event(dev->priv.eswitch, eqe);
 			break;
 #endif
+
+		case MLX5_EVENT_TYPE_PORT_MODULE_EVENT:
+			mlx5_port_module_event(dev, eqe);
+			break;
+
 		default:
 			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
 				       eqe->type, eq->eqn);
@@ -480,6 +487,11 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
 	    mlx5_core_is_pf(dev))
 		async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE);
 
+	if (MLX5_CAP_GEN(dev, port_module_event))
+		async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT);
+	else
+		mlx5_core_dbg(dev, "port_module_event is not set\n");
+
 	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
 				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
 				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 4762bb9d013c..7e635ebda199 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -81,6 +81,7 @@ int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 		     unsigned long param);
+void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
 void mlx5_enter_error_state(struct mlx5_core_dev *dev);
 void mlx5_disable_device(struct mlx5_core_dev *dev);
 void mlx5_recover_device(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index 34e7184e23c9..b77928f5b46e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -746,3 +746,60 @@ void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 	*supported = !!(MLX5_GET(pcmr_reg, out, fcs_cap));
 	*enabled = !!(MLX5_GET(pcmr_reg, out, fcs_chk));
 }
+
+static const char *mlx5_pme_status[MLX5_MODULE_STATUS_NUM] = {
+	"Cable plugged",   /* MLX5_MODULE_STATUS_PLUGGED    = 0x1 */
+	"Cable unplugged", /* MLX5_MODULE_STATUS_UNPLUGGED  = 0x2 */
+	"Cable error",     /* MLX5_MODULE_STATUS_ERROR      = 0x3 */
+};
+
+static const char *mlx5_pme_error[MLX5_MODULE_EVENT_ERROR_NUM] = {
+	"Power budget exceeded",
+	"Long Range for non MLNX cable",
+	"Bus stuck(I2C or data shorted)",
+	"No EEPROM/retry timeout",
+	"Enforce part number list",
+	"Unknown identifier",
+	"High Temperature",
+	"Bad or shorted cable/module",
+	"Unknown status",
+};
+
+void mlx5_port_module_event(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
+{
+	enum port_module_event_status_type module_status;
+	enum port_module_event_error_type error_type;
+	struct mlx5_eqe_port_module *module_event_eqe;
+	struct mlx5_priv *priv = &dev->priv;
+	u8 module_num;
+
+	module_event_eqe = &eqe->data.port_module;
+	module_num = module_event_eqe->module;
+	module_status = module_event_eqe->module_status &
+			PORT_MODULE_EVENT_MODULE_STATUS_MASK;
+	error_type = module_event_eqe->error_type &
+		     PORT_MODULE_EVENT_ERROR_TYPE_MASK;
+
+	if (module_status < MLX5_MODULE_STATUS_ERROR) {
+		priv->pme_stats.status_counters[module_status - 1]++;
+	} else if (module_status == MLX5_MODULE_STATUS_ERROR) {
+		if (error_type >= MLX5_MODULE_EVENT_ERROR_UNKNOWN)
+			/* Unknown error type */
+			error_type = MLX5_MODULE_EVENT_ERROR_UNKNOWN;
+		priv->pme_stats.error_counters[error_type]++;
+	}
+
+	if (!printk_ratelimit())
+		return;
+
+	if (module_status < MLX5_MODULE_STATUS_ERROR)
+		mlx5_core_info(dev,
+			       "Port module event: module %u, %s\n",
+			       module_num, mlx5_pme_status[module_status - 1]);
+
+	else if (module_status == MLX5_MODULE_STATUS_ERROR)
+		mlx5_core_info(dev,
+			       "Port module event[error]: module %u, %s, %s\n",
+			       module_num, mlx5_pme_status[module_status - 1],
+			       mlx5_pme_error[error_type]);
+}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5e7dbbcf47f0..7336c8e529d7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -498,6 +498,31 @@ struct mlx5_rl_table {
 	struct mlx5_rl_entry   *rl_entry;
 };
 
+enum port_module_event_status_type {
+	MLX5_MODULE_STATUS_PLUGGED   = 0x1,
+	MLX5_MODULE_STATUS_UNPLUGGED = 0x2,
+	MLX5_MODULE_STATUS_ERROR     = 0x3,
+	MLX5_MODULE_STATUS_NUM       = 0x3,
+};
+
+enum  port_module_event_error_type {
+	MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED,
+	MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE,
+	MLX5_MODULE_EVENT_ERROR_BUS_STUCK,
+	MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT,
+	MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST,
+	MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER,
+	MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE,
+	MLX5_MODULE_EVENT_ERROR_BAD_CABLE,
+	MLX5_MODULE_EVENT_ERROR_UNKNOWN,
+	MLX5_MODULE_EVENT_ERROR_NUM,
+};
+
+struct mlx5_port_module_event_stats {
+	u64 status_counters[MLX5_MODULE_STATUS_NUM];
+	u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM];
+};
+
 struct mlx5_priv {
 	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	eq_table;
@@ -559,6 +584,8 @@ struct mlx5_priv {
 	unsigned long		pci_dev_data;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
+
+	struct mlx5_port_module_event_stats  pme_stats;
 };
 
 enum mlx5_device_state {
-- 
cgit v1.2.3


From 0dbc6fe09fbe5f5191bcc606f3bdc9a829f97066 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:59 +0200
Subject: net/mlx5: Set driver version infrastructure

Add the driver_version capability bit and the set driver
version command to the mlx5_ifc firmware header.  The only purpose
of this command is to store a driver version/OS string in FW
to be reported and displayed in various management systems,
such as IPMI/BMC.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cd1d530ca368..f08a06247fba 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -83,6 +83,7 @@ enum {
 	MLX5_CMD_OP_SET_HCA_CAP                   = 0x109,
 	MLX5_CMD_OP_QUERY_ISSI                    = 0x10a,
 	MLX5_CMD_OP_SET_ISSI                      = 0x10b,
+	MLX5_CMD_OP_SET_DRIVER_VERSION            = 0x10d,
 	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
 	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
 	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
@@ -909,7 +910,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_at_261[0x1];
+	u8         driver_version[0x1];
 	u8         pad_tx_eth_packet[0x1];
 	u8         reserved_at_263[0x8];
 	u8         log_bf_reg_size[0x5];
@@ -4005,6 +4006,25 @@ struct mlx5_ifc_query_issi_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_set_driver_version_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_driver_version_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+	u8         driver_version[64][0x8];
+};
+
 struct mlx5_ifc_query_hca_vport_pkey_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From 7f503169cabd70c1f13b9279c50eca7dfb9a7d51 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 17 Nov 2016 13:46:01 +0200
Subject: net/mlx5: Add MPCNT register infrastructure

Add the needed infrastructure for future use of MPCNT register.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h   |  5 +++
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 93 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 52b437431c6a..9f489365b3d3 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1071,6 +1071,11 @@ enum {
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
+enum {
+	MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP       = 0x0,
+	MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP = 0x2,
+};
+
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7336c8e529d7..ae1f451e8f89 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -121,6 +121,7 @@ enum {
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
+	MLX5_REG_MPCNT		 = 0x9051,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f08a06247fba..a5f0fbedf1e7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1757,6 +1757,80 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
 	u8         reserved_at_4c0[0x300];
 };
 
+struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits {
+	u8         life_time_counter_high[0x20];
+
+	u8         life_time_counter_low[0x20];
+
+	u8         rx_errors[0x20];
+
+	u8         tx_errors[0x20];
+
+	u8         l0_to_recovery_eieos[0x20];
+
+	u8         l0_to_recovery_ts[0x20];
+
+	u8         l0_to_recovery_framing[0x20];
+
+	u8         l0_to_recovery_retrain[0x20];
+
+	u8         crc_error_dllp[0x20];
+
+	u8         crc_error_tlp[0x20];
+
+	u8         reserved_at_140[0x680];
+};
+
+struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits {
+	u8         life_time_counter_high[0x20];
+
+	u8         life_time_counter_low[0x20];
+
+	u8         time_to_boot_image_start[0x20];
+
+	u8         time_to_link_image[0x20];
+
+	u8         calibration_time[0x20];
+
+	u8         time_to_first_perst[0x20];
+
+	u8         time_to_detect_state[0x20];
+
+	u8         time_to_l0[0x20];
+
+	u8         time_to_crs_en[0x20];
+
+	u8         time_to_plastic_image_start[0x20];
+
+	u8         time_to_iron_image_start[0x20];
+
+	u8         perst_handler[0x20];
+
+	u8         times_in_l1[0x20];
+
+	u8         times_in_l23[0x20];
+
+	u8         dl_down[0x20];
+
+	u8         config_cycle1usec[0x20];
+
+	u8         config_cycle2to7usec[0x20];
+
+	u8         config_cycle_8to15usec[0x20];
+
+	u8         config_cycle_16_to_63usec[0x20];
+
+	u8         config_cycle_64usec[0x20];
+
+	u8         correctable_err_msg_sent[0x20];
+
+	u8         non_fatal_err_msg_sent[0x20];
+
+	u8         fatal_err_msg_sent[0x20];
+
+	u8         reserved_at_2e0[0x4e0];
+};
+
 struct mlx5_ifc_cmd_inter_comp_event_bits {
 	u8         command_completion_vector[0x20];
 
@@ -2921,6 +2995,12 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	u8         reserved_at_0[0x7c0];
 };
 
+union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits {
+	struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout;
+	struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits pcie_tas_cntrs_grp_data_layout;
+	u8         reserved_at_0[0x7c0];
+};
+
 union mlx5_ifc_event_auto_bits {
 	struct mlx5_ifc_comp_event_bits comp_event;
 	struct mlx5_ifc_dct_events_bits dct_events;
@@ -7240,6 +7320,18 @@ struct mlx5_ifc_ppcnt_reg_bits {
 	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
+struct mlx5_ifc_mpcnt_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         pcie_index[0x8];
+	u8         reserved_at_10[0xa];
+	u8         grp[0x6];
+
+	u8         clr[0x1];
+	u8         reserved_at_21[0x1f];
+
+	union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set;
+};
+
 struct mlx5_ifc_ppad_reg_bits {
 	u8         reserved_at_0[0x3];
 	u8         single_mac[0x1];
@@ -7845,6 +7937,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
 	struct mlx5_ifc_ppad_reg_bits ppad_reg;
 	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+	struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
 	struct mlx5_ifc_pplm_reg_bits pplm_reg;
 	struct mlx5_ifc_pplr_reg_bits pplr_reg;
 	struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
-- 
cgit v1.2.3


From 968ad9da7e0e333e25442950e10a1b631981ce84 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Thu, 17 Nov 2016 13:07:21 +0100
Subject: ethtool: Implements ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE

Add get_tunable/set_tunable function pointers to the phy_driver
structure, and use these function pointers to implement the
ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE ioctls.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h |  7 +++++
 net/core/ethtool.c  | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index b9bd3b4f4ea1..edde28ce163a 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -611,6 +611,13 @@ struct phy_driver {
 	void (*get_strings)(struct phy_device *dev, u8 *data);
 	void (*get_stats)(struct phy_device *dev,
 			  struct ethtool_stats *stats, u64 *data);
+
+	/* Get and Set PHY tunables */
+	int (*get_tunable)(struct phy_device *dev,
+			   struct ethtool_tunable *tuna, void *data);
+	int (*set_tunable)(struct phy_device *dev,
+			    struct ethtool_tunable *tuna,
+			    const void *data);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 977489820eb9..61aebdf9c61b 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -119,6 +119,11 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_TX_COPYBREAK]	= "tx-copybreak",
 };
 
+static const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+	[ETHTOOL_ID_UNSPEC]     = "Unspec",
+};
+
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_gfeatures cmd = {
@@ -227,6 +232,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
 	if (sset == ETH_SS_TUNABLES)
 		return ARRAY_SIZE(tunable_strings);
 
+	if (sset == ETH_SS_PHY_TUNABLES)
+		return ARRAY_SIZE(phy_tunable_strings);
+
 	if (sset == ETH_SS_PHY_STATS) {
 		if (dev->phydev)
 			return phy_get_sset_count(dev->phydev);
@@ -253,6 +261,8 @@ static void __ethtool_get_strings(struct net_device *dev,
 		       sizeof(rss_hash_func_strings));
 	else if (stringset == ETH_SS_TUNABLES)
 		memcpy(data, tunable_strings, sizeof(tunable_strings));
+	else if (stringset == ETH_SS_PHY_TUNABLES)
+		memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
 	else if (stringset == ETH_SS_PHY_STATS) {
 		struct phy_device *phydev = dev->phydev;
 
@@ -2422,6 +2432,76 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
 	};
 }
 
+static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
+{
+	switch (tuna->id) {
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+	int ret;
+	struct ethtool_tunable tuna;
+	struct phy_device *phydev = dev->phydev;
+	void *data;
+
+	if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+		return -EFAULT;
+	ret = ethtool_phy_tunable_valid(&tuna);
+	if (ret)
+		return ret;
+	data = kmalloc(tuna.len, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+	ret = phydev->drv->get_tunable(phydev, &tuna, data);
+	if (ret)
+		goto out;
+	useraddr += sizeof(tuna);
+	ret = -EFAULT;
+	if (copy_to_user(useraddr, data, tuna.len))
+		goto out;
+	ret = 0;
+
+out:
+	kfree(data);
+	return ret;
+}
+
+static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
+{
+	int ret;
+	struct ethtool_tunable tuna;
+	struct phy_device *phydev = dev->phydev;
+	void *data;
+
+	if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+		return -EOPNOTSUPP;
+	if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
+		return -EFAULT;
+	ret = ethtool_phy_tunable_valid(&tuna);
+	if (ret)
+		return ret;
+	data = kmalloc(tuna.len, GFP_USER);
+	if (!data)
+		return -ENOMEM;
+	useraddr += sizeof(tuna);
+	ret = -EFAULT;
+	if (copy_from_user(data, useraddr, tuna.len))
+		goto out;
+	ret = phydev->drv->set_tunable(phydev, &tuna, data);
+
+out:
+	kfree(data);
+	return ret;
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
@@ -2479,6 +2559,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GET_TS_INFO:
 	case ETHTOOL_GEEE:
 	case ETHTOOL_GTUNABLE:
+	case ETHTOOL_PHY_GTUNABLE:
 		break;
 	default:
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -2684,6 +2765,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SLINKSETTINGS:
 		rc = ethtool_set_link_ksettings(dev, useraddr);
 		break;
+	case ETHTOOL_PHY_GTUNABLE:
+		rc = get_phy_tunable(dev, useraddr);
+		break;
+	case ETHTOOL_PHY_STUNABLE:
+		rc = set_phy_tunable(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 603ab57363a0ba66c77ca4b3027bc0b4505df504 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 Nov 2016 11:19:12 -0800
Subject: bus: mvebu-bus: Provide inline stub for mvebu_mbus_get_dram_win_info

In preparation for allowing CONFIG_MVNETA_BM to build with COMPILE_TEST,
provide an inline stub for mvebu_mbus_get_dram_win_info().

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mbus.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 2931aa43dab1..0d3f14fd2621 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -82,6 +82,7 @@ static inline int mvebu_mbus_get_io_win_info(phys_addr_t phyaddr, u32 *size,
 }
 #endif
 
+#ifdef CONFIG_MVEBU_MBUS
 int mvebu_mbus_save_cpu_target(u32 __iomem *store_addr);
 void mvebu_mbus_get_pcie_mem_aperture(struct resource *res);
 void mvebu_mbus_get_pcie_io_aperture(struct resource *res);
@@ -97,5 +98,12 @@ int mvebu_mbus_init(const char *soc, phys_addr_t mbus_phys_base,
 		    size_t mbus_size, phys_addr_t sdram_phys_base,
 		    size_t sdram_size);
 int mvebu_mbus_dt_init(bool is_coherent);
+#else
+static inline int mvebu_mbus_get_dram_win_info(phys_addr_t phyaddr, u8 *target,
+					       u8 *attr)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_MVEBU_MBUS */
 
 #endif /* __LINUX_MBUS_H */
-- 
cgit v1.2.3


From d66016a77757b004b8637f44d87bedfc4a47b89c Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Fri, 18 Nov 2016 15:40:39 -0800
Subject: virtio_net.h: Fix comment.

Fix incorrect comment after the final #endif.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 1c912f85e041..74f1e3363506 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -98,4 +98,4 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 	return 0;
 }
 
-#endif /* _LINUX_VIRTIO_BYTEORDER */
+#endif /* _LINUX_VIRTIO_NET_H */
-- 
cgit v1.2.3


From 9403cd7cbb08aa3709c632decafa2014c8ed96e6 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Fri, 18 Nov 2016 15:40:40 -0800
Subject: virtio_net: Do not clear memory for struct virtio_net_hdr twice.

virtio_net_hdr_from_skb() clears the memory for the header, so there
is no point for the callers to do the same.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c          | 3 +--
 include/linux/virtio_net.h | 2 +-
 net/packet/af_packet.c     | 2 --
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3b8d8cc6d2ea..64e694c68d99 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1360,8 +1360,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	}
 
 	if (vnet_hdr_sz) {
-		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		int ret;
+		struct virtio_net_hdr gso;
 
 		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 74f1e3363506..66204007d7ac 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -58,7 +58,7 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian)
 {
-	memset(hdr, 0, sizeof(*hdr));
+	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *sinfo = skb_shinfo(skb);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d2238b204691..abe6c0b6683c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1970,8 +1970,6 @@ static unsigned int run_filter(struct sk_buff *skb,
 static int __packet_rcv_vnet(const struct sk_buff *skb,
 			     struct virtio_net_hdr *vnet_hdr)
 {
-	*vnet_hdr = (const struct virtio_net_hdr) { 0 };
-
 	if (virtio_net_hdr_from_skb(skb, vnet_hdr, vio_le()))
 		BUG();
 
-- 
cgit v1.2.3


From c72d8cdaa5dbd3baf918046ee5149ab69330923e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 04:08:08 +0300
Subject: net: fix bogus cast in skb_pagelen() and use unsigned variables

1) cast to "int" is unnecessary:
   u8 will be promoted to int before decrementing,
   small positive numbers fit into "int", so their values won't be changed
   during promotion.

   Once everything is int including loop counters, signedness doesn't
   matter: 32-bit operations will stay 32-bit operations.

   But! Someone tried to make this loop smart by making everything of
   the same type apparently in an attempt to optimise it.
   Do the optimization, just differently.
   Do the cast where it matters. :^)

2) frag size is unsigned entity and sum of fragments sizes is also
   unsigned.

Make everything unsigned, leave no MOVSX instruction behind.

	add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-4 (-4)
	function                                     old     new   delta
	skb_cow_data                                 835     834      -1
	ip_do_fragment                              2549    2548      -1
	ip6_fragment                                3130    3128      -2
	Total: Before=154865032, After=154865028, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 net/ipv4/ip_output.c   | 2 +-
 net/ipv6/ip6_output.c  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a4aeeca7e805..9c535fbccf2c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1799,11 +1799,11 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb)
 	return skb->len - skb->data_len;
 }
 
-static inline int skb_pagelen(const struct sk_buff *skb)
+static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 {
-	int i, len = 0;
+	unsigned int i, len = 0;
 
-	for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
+	for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
 		len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 	return len + skb_headlen(skb);
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index eaf720b65d7e..358f2c82b030 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -581,7 +581,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 	 */
 	if (skb_has_frag_list(skb)) {
 		struct sk_buff *frag, *frag2;
-		int first_len = skb_pagelen(skb);
+		unsigned int first_len = skb_pagelen(skb);
 
 		if (first_len - hlen > mtu ||
 		    ((first_len - hlen) & 7) ||
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b37054b1873d..312cbd0e5038 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -625,7 +625,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 	if (skb_has_frag_list(skb)) {
-		int first_len = skb_pagelen(skb);
+		unsigned int first_len = skb_pagelen(skb);
 		struct sk_buff *frag2;
 
 		if (first_len - hlen > mtu ||
-- 
cgit v1.2.3


From 6d67942dd0ebc3dddc86edf9208169d064a9b3d7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 19 Nov 2016 01:45:03 +0100
Subject: bpf: add __must_check attributes to refcount manipulating helpers

Helpers like bpf_prog_add(), bpf_prog_inc(), bpf_map_inc() can fail
with an error, so make sure the caller properly checks their return
value and not just ignores it, which could worst-case lead to use
after free.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 01c1487277b2..69d0a7f12a3b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -233,14 +233,14 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
+struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
+struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
@@ -299,7 +299,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
+static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
+							  int i)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -311,7 +312,8 @@ static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
-static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+
+static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
cgit v1.2.3


From d06f78c4232d6a84b50839f61d9d7fbb222d8118 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 22 Nov 2016 11:40:55 -0800
Subject: net: phy: broadcom: Add support code for downshift/Wirespeed

Broadcom's Wirespeed feature allows us to configure how auto-negotiation
should behave with fewer working pairs of wires on a cable. Add support
code for retrieving and setting such downshift counters using the
recently added ethtool downshift tunables.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm-phy-lib.c | 86 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/phy/bcm-phy-lib.h |  5 +++
 include/linux/brcmphy.h       | 10 +++++
 3 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c
index 18e11b3a0f41..d742894816f6 100644
--- a/drivers/net/phy/bcm-phy-lib.c
+++ b/drivers/net/phy/bcm-phy-lib.c
@@ -225,6 +225,92 @@ int bcm_phy_enable_eee(struct phy_device *phydev)
 }
 EXPORT_SYMBOL_GPL(bcm_phy_enable_eee);
 
+int bcm_phy_downshift_get(struct phy_device *phydev, u8 *count)
+{
+	int val;
+
+	val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+	if (val < 0)
+		return val;
+
+	/* Check if wirespeed is enabled or not */
+	if (!(val & MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN)) {
+		*count = DOWNSHIFT_DEV_DISABLE;
+		return 0;
+	}
+
+	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_SCR2);
+	if (val < 0)
+		return val;
+
+	/* Downgrade after one link attempt */
+	if (val & BCM54XX_SHD_SCR2_WSPD_RTRY_DIS) {
+		*count = 1;
+	} else {
+		/* Downgrade after configured retry count */
+		val >>= BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT;
+		val &= BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_MASK;
+		*count = val + BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_OFFSET;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bcm_phy_downshift_get);
+
+int bcm_phy_downshift_set(struct phy_device *phydev, u8 count)
+{
+	int val = 0, ret = 0;
+
+	/* Range check the number given */
+	if (count - BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_OFFSET >
+	    BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_MASK &&
+	    count != DOWNSHIFT_DEV_DEFAULT_COUNT) {
+		return -ERANGE;
+	}
+
+	val = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_MISC);
+	if (val < 0)
+		return val;
+
+	/* Set the write enable bit */
+	val |= MII_BCM54XX_AUXCTL_MISC_WREN;
+
+	if (count == DOWNSHIFT_DEV_DISABLE) {
+		val &= ~MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN;
+		return bcm54xx_auxctl_write(phydev,
+					    MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+					    val);
+	} else {
+		val |= MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN;
+		ret = bcm54xx_auxctl_write(phydev,
+					   MII_BCM54XX_AUXCTL_SHDWSEL_MISC,
+					   val);
+		if (ret < 0)
+			return ret;
+	}
+
+	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_SCR2);
+	val &= ~(BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_MASK <<
+		 BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT |
+		 BCM54XX_SHD_SCR2_WSPD_RTRY_DIS);
+
+	switch (count) {
+	case 1:
+		val |= BCM54XX_SHD_SCR2_WSPD_RTRY_DIS;
+		break;
+	case DOWNSHIFT_DEV_DEFAULT_COUNT:
+		val |= 1 << BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT;
+		break;
+	default:
+		val |= (count - BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_OFFSET) <<
+			BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT;
+		break;
+	}
+
+	return bcm_phy_write_shadow(phydev, BCM54XX_SHD_SCR2, val);
+}
+EXPORT_SYMBOL_GPL(bcm_phy_downshift_set);
+
 MODULE_DESCRIPTION("Broadcom PHY Library");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Broadcom Corporation");
diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h
index 31cb4fdf5d5a..3f492e629094 100644
--- a/drivers/net/phy/bcm-phy-lib.h
+++ b/drivers/net/phy/bcm-phy-lib.h
@@ -37,4 +37,9 @@ int bcm_phy_config_intr(struct phy_device *phydev);
 int bcm_phy_enable_apd(struct phy_device *phydev, bool dll_pwr_down);
 
 int bcm_phy_enable_eee(struct phy_device *phydev);
+
+int bcm_phy_downshift_get(struct phy_device *phydev, u8 *count);
+
+int bcm_phy_downshift_set(struct phy_device *phydev, u8 count);
+
 #endif /* _LINUX_BCM_PHY_LIB_H */
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 848dc508ef57..f9f8aaf9c943 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -114,6 +114,7 @@
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	(1 << 4)
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
@@ -130,6 +131,7 @@
 #define BCM_LED_SRC_INTR	0x6
 #define BCM_LED_SRC_QUALITY	0x7
 #define BCM_LED_SRC_RCVLED	0x8
+#define BCM_LED_SRC_WIRESPEED	0x9
 #define BCM_LED_SRC_MULTICOLOR1	0xa
 #define BCM_LED_SRC_OPENSHORT	0xb
 #define BCM_LED_SRC_OFF		0xe	/* Tied high */
@@ -141,6 +143,14 @@
  * Shadow values go into bits [14:10] of register 0x1c to select a shadow
  * register to access.
  */
+
+/* 00100: Reserved control register 2 */
+#define BCM54XX_SHD_SCR2		0x04
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_DIS	0x100
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT	2
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_OFFSET	2
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_MASK	0x7
+
 /* 00101: Spare Control Register 3 */
 #define BCM54XX_SHD_SCR3		0x05
 #define  BCM54XX_SHD_SCR3_DEF_CLK125	0x0001
-- 
cgit v1.2.3


From 3df5b3c67546fb05266766b6abaf71563f82efe4 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:54 +0200
Subject: net: Add net-device param to the get offloaded stats ndo

Some drivers would need to check a few internal matters for
that. To be used in a downstream mlx5 commit.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +-
 include/linux/netdevice.h                      | 4 ++--
 net/core/rtnetlink.c                           | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 4a1f9d5f7c03..e0d7d5adbaee 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -857,7 +857,7 @@ mlxsw_sp_port_get_sw_stats64(const struct net_device *dev,
 	return 0;
 }
 
-static bool mlxsw_sp_port_has_offload_stats(int attr_id)
+static bool mlxsw_sp_port_has_offload_stats(const struct net_device *dev, int attr_id)
 {
 	switch (attr_id) {
 	case IFLA_OFFLOAD_XSTATS_CPU_HIT:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84800edd249..ae32a27523f9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,7 +925,7 @@ struct netdev_xdp {
  *	3. Update dev->stats asynchronously and atomically, and define
  *	   neither operation.
  *
- * bool (*ndo_has_offload_stats)(int attr_id)
+ * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
  *	Return true if this device supports offload stats of this attr_id.
  *
  * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
@@ -1165,7 +1165,7 @@ struct net_device_ops {
 
 	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
 						     struct rtnl_link_stats64 *storage);
-	bool			(*ndo_has_offload_stats)(int attr_id);
+	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
 	int			(*ndo_get_offload_stats)(int attr_id,
 							 const struct net_device *dev,
 							 void *attr_data);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a99917b5de33..ef8a96010816 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3671,7 +3671,7 @@ static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
 		if (!size)
 			continue;
 
-		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+		if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
 			continue;
 
 		attr = nla_reserve_64bit(skb, attr_id, size,
@@ -3712,7 +3712,7 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
 
 	for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
 	     attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
-		if (!dev->netdev_ops->ndo_has_offload_stats(attr_id))
+		if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
 			continue;
 		size = rtnl_get_offload_stats_attr_size(attr_id);
 		nla_size += nla_total_size_64bit(size);
-- 
cgit v1.2.3


From 34e4e99078667d30f71a50c1e5181e4270e9d8bb Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:58 +0200
Subject: net/mlx5: Enable to query min inline for a specific vport

Also move the inline capabilities enum to a shared header vport.h

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  6 ------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++------
 drivers/net/ethernet/mellanox/mlx5/core/vport.c   | 14 ++++++++------
 include/linux/mlx5/vport.h                        | 10 ++++++++--
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ebf5dbc85bff..a2b32ed24315 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -150,12 +150,6 @@ static inline int mlx5_max_log_rq_size(int wq_type)
 	}
 }
 
-enum {
-	MLX5E_INLINE_MODE_L2,
-	MLX5E_INLINE_MODE_VPORT_CONTEXT,
-	MLX5_INLINE_MODE_NOT_REQUIRED,
-};
-
 struct mlx5e_tx_wqe {
 	struct mlx5_wqe_ctrl_seg ctrl;
 	struct mlx5_wqe_eth_seg  eth;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8e8d809bf3fd..19403d6bf369 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -957,7 +957,7 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
 	sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
 	sq->max_inline  = param->max_inline;
 	sq->min_inline_mode =
-		MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5E_INLINE_MODE_VPORT_CONTEXT ?
+		MLX5_CAP_ETH(mdev, wqe_inline_mode) == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT ?
 		param->min_inline_mode : 0;
 
 	err = mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu));
@@ -3417,14 +3417,13 @@ static void mlx5e_query_min_inline(struct mlx5_core_dev *mdev,
 				   u8 *min_inline_mode)
 {
 	switch (MLX5_CAP_ETH(mdev, wqe_inline_mode)) {
-	case MLX5E_INLINE_MODE_L2:
+	case MLX5_CAP_INLINE_MODE_L2:
 		*min_inline_mode = MLX5_INLINE_MODE_L2;
 		break;
-	case MLX5E_INLINE_MODE_VPORT_CONTEXT:
-		mlx5_query_nic_vport_min_inline(mdev,
-						min_inline_mode);
+	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
+		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
 		break;
-	case MLX5_INLINE_MODE_NOT_REQUIRED:
+	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
 		*min_inline_mode = MLX5_INLINE_MODE_NONE;
 		break;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index 525f17af108e..269e4401c342 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -113,15 +113,17 @@ static int mlx5_modify_nic_vport_context(struct mlx5_core_dev *mdev, void *in,
 	return mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
 }
 
-void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
-				     u8 *min_inline_mode)
+int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
+				    u16 vport, u8 *min_inline)
 {
 	u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {0};
+	int err;
 
-	mlx5_query_nic_vport_context(mdev, 0, out, sizeof(out));
-
-	*min_inline_mode = MLX5_GET(query_nic_vport_context_out, out,
-				    nic_vport_context.min_wqe_inline_mode);
+	err = mlx5_query_nic_vport_context(mdev, vport, out, sizeof(out));
+	if (!err)
+		*min_inline = MLX5_GET(query_nic_vport_context_out, out,
+				       nic_vport_context.min_wqe_inline_mode);
+	return err;
 }
 EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline);
 
diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 451b0bde9083..ec35157ea725 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -36,6 +36,12 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/device.h>
 
+enum {
+	MLX5_CAP_INLINE_MODE_L2,
+	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
+	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
+};
+
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				u16 vport);
@@ -43,8 +49,8 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 state);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 *addr);
-void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
-				     u8 *min_inline);
+int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
+				    u16 vport, u8 *min_inline);
 int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 min_inline);
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 3007098494bec614fb55dee7bc0410bb7db5ad18 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 Nov 2016 16:52:26 +0100
Subject: cgroup: add support for eBPF programs

This patch adds two sets of eBPF program pointers to struct cgroup.
One for those that are directly pinned to a cgroup, and one for those
that are effective for it.

To illustrate the logic behind that, assume the following example
cgroup hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h  |  79 +++++++++++++++++++++
 include/linux/cgroup-defs.h |   4 ++
 init/Kconfig                |  12 ++++
 kernel/bpf/Makefile         |   1 +
 kernel/bpf/cgroup.c         | 167 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c             |  18 +++++
 6 files changed, 281 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ec80d0c0953e
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/bpf.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+	/*
+	 * Store two sets of bpf_prog pointers, one for programs that are
+	 * pinned directly to this cgroup, and one for those that are effective
+	 * when this cgroup is accessed.
+	 */
+	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled)						\
+		__ret = __cgroup_bpf_run_filter(sk, skb,		\
+						BPF_CGROUP_INET_INGRESS); \
+									\
+	__ret;								\
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
+		typeof(sk) __sk = sk_to_full_sk(sk);			\
+		if (sk_fullsock(__sk))					\
+			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
+						BPF_CGROUP_INET_EGRESS); \
+	}								\
+	__ret;								\
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+				      struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5b17de62c962..861b4677fc5b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,9 @@ struct cgroup {
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
+	/* used to store eBPF programs */
+	struct cgroup_bpf bpf;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/init/Kconfig b/init/Kconfig
index 34407f15e6d3..405120b5f13e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1154,6 +1154,18 @@ config CGROUP_PERF
 
 	  Say N if unsure.
 
+config CGROUP_BPF
+	bool "Support for eBPF programs attached to cgroups"
+	depends on BPF_SYSCALL && SOCK_CGROUP_DATA
+	help
+	  Allow attaching eBPF programs to a cgroup using the bpf(2)
+	  syscall command BPF_PROG_ATTACH.
+
+	  In which context these programs are accessed depends on the type
+	  of attachment. For instance, programs that are attached using
+	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+	  inet sockets.
+
 config CGROUP_DEBUG
 	bool "Example controller"
 	default n
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index c4d89d6e2058..1276474ac3cd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..a0ab43f264b0
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,167 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+		struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+		if (prog) {
+			bpf_prog_put(prog);
+			static_branch_dec(&cgroup_bpf_enabled_key);
+		}
+	}
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+		struct bpf_prog *e;
+
+		e = rcu_dereference_protected(parent->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+		rcu_assign_pointer(cgrp->bpf.effective[type], e);
+	}
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup whose descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and one for those which are effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup
+ * and releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type)
+{
+	struct bpf_prog *old_prog, *effective;
+	struct cgroup_subsys_state *pos;
+
+	old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+	effective = (!prog && parent) ?
+		rcu_dereference_protected(parent->bpf.effective[type],
+					  lockdep_is_held(&cgroup_mutex)) :
+		prog;
+
+	css_for_each_descendant_pre(pos, &cgrp->self) {
+		struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+		/* skip the subtree if the descendant has its own program */
+		if (desc->bpf.prog[type] && desc != cgrp)
+			pos = css_rightmost_descendant(pos);
+		else
+			rcu_assign_pointer(desc->bpf.effective[type],
+					   effective);
+	}
+
+	if (prog)
+		static_branch_inc(&cgroup_bpf_enabled_key);
+
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		static_branch_dec(&cgroup_bpf_enabled_key);
+	}
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be executed
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	if (sk->sk_family != AF_INET &&
+	    sk->sk_family != AF_INET6)
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog) {
+		unsigned int offset = skb->data - skb_network_header(skb);
+
+		__skb_push(skb, offset);
+		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+		__skb_pull(skb, offset);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 85bc9beb046d..2ee9ec3051b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
+
+		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
+	if (parent)
+		cgroup_bpf_inherit(cgrp, parent);
+
 	cgroup_propagate_control(cgrp);
 
 	/* @cgrp doesn't have dir yet so the following will only create csses */
@@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
 }
 subsys_initcall(cgroup_namespaces_init);
 
+#ifdef CONFIG_CGROUP_BPF
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	mutex_lock(&cgroup_mutex);
+	__cgroup_bpf_update(cgrp, parent, prog, type);
+	mutex_unlock(&cgroup_mutex);
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
-- 
cgit v1.2.3


From 5a717f4f8f2830f297b5511022481bdc27b9d576 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 24 Nov 2016 07:04:08 +0200
Subject: netdevice: fix sparse warning for HARD_TX_LOCK

sparse warns about context imbalance in any code
that uses HARD_TX_LOCK/UNLOCK - this is because it's
unable to determine that flags don't change so
lock and unlock are paired.

Seems easy enough to fix by adding __acquire/__release
calls.

With this patch af_packet.c is now sparse-clean.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ff57cd2eba3b..4ffcd874cc20 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3462,6 +3462,17 @@ static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 	txq->xmit_lock_owner = cpu;
 }
 
+static inline bool __netif_tx_acquire(struct netdev_queue *txq)
+{
+	__acquire(&txq->_xmit_lock);
+	return true;
+}
+
+static inline void __netif_tx_release(struct netdev_queue *txq)
+{
+	__release(&txq->_xmit_lock);
+}
+
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
@@ -3563,17 +3574,21 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
 #define HARD_TX_LOCK(dev, txq, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		__netif_tx_lock(txq, cpu);		\
+	} else {					\
+		__netif_tx_acquire(txq);		\
 	}						\
 }
 
 #define HARD_TX_TRYLOCK(dev, txq)			\
 	(((dev->features & NETIF_F_LLTX) == 0) ?	\
 		__netif_tx_trylock(txq) :		\
-		true )
+		__netif_tx_acquire(txq))
 
 #define HARD_TX_UNLOCK(dev, txq) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		__netif_tx_unlock(txq);			\
+	} else {					\
+		__netif_tx_release(txq);		\
 	}						\
 }
 
-- 
cgit v1.2.3


From 88575199cc65de99a156888629a68180c830eff2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 26 Nov 2016 01:28:04 +0100
Subject: bpf: drop unnecessary context cast from BPF_PROG_RUN

Since long already bpf_func is not only about struct sk_buff * as
input anymore. Make it generic as void *, so that callers don't
need to cast for it each time they call BPF_PROG_RUN().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 include/linux/filter.h                              | 6 +++---
 kernel/events/core.c                                | 2 +-
 kernel/seccomp.c                                    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index eb3715700c95..876ab3a92ad5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1518,7 +1518,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len)
 	xdp.data = data;
 	xdp.data_end = data + len;
 
-	return BPF_PROG_RUN(prog, (void *)&xdp);
+	return BPF_PROG_RUN(prog, &xdp);
 }
 
 /**
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c521adfe..7f246a281435 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -408,8 +408,8 @@ struct bpf_prog {
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
-	unsigned int		(*bpf_func)(const struct sk_buff *skb,
-					    const struct bpf_insn *filter);
+	unsigned int		(*bpf_func)(const void *ctx,
+					    const struct bpf_insn *insn);
 	/* Instructions for interpreter */
 	union {
 		struct sock_filter	insns[0];
@@ -504,7 +504,7 @@ static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	u32 ret;
 
 	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, (void *)xdp);
+	ret = BPF_PROG_RUN(prog, xdp);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6ee1febdf6ff..22cc734aa1b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7726,7 +7726,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 		goto out;
 	rcu_read_lock();
-	ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
+	ret = BPF_PROG_RUN(event->prog, &ctx);
 	rcu_read_unlock();
 out:
 	__this_cpu_dec(bpf_prog_active);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0db7c8a2afe2..bff9c774987a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
 	 * value always takes priority (ignoring the DATA).
 	 */
 	for (; f; f = f->prev) {
-		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
+		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
 
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
-- 
cgit v1.2.3


From c491680f8f489926eebfdf2cd006767fc8bdaa49 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 26 Nov 2016 01:28:06 +0100
Subject: bpf: reuse dev_is_mac_header_xmit for redirect

Commit dcf800344a91 ("net/sched: act_mirred: Refactor detection whether
dev needs xmit at mac header") added dev_is_mac_header_xmit(); since it's
also useful elsewhere, move it to if_arp.h and reuse it for BPF.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 16 ++++++++++++++++
 net/core/filter.c      | 14 ++++----------
 net/sched/act_mirred.c | 15 +--------------
 3 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index f563907ed776..3355efc89781 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -44,4 +44,20 @@ static inline int arp_hdr_len(struct net_device *dev)
 		return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2;
 	}
 }
+
+static inline bool dev_is_mac_header_xmit(const struct net_device *dev)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
 #endif	/* _LINUX_IF_ARP_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index ea315af56511..698a262b8ebb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -30,6 +30,7 @@
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/if_packet.h>
+#include <linux/if_arp.h>
 #include <linux/gfp.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -1696,17 +1697,10 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
 			  u32 flags)
 {
-	switch (dev->type) {
-	case ARPHRD_TUNNEL:
-	case ARPHRD_TUNNEL6:
-	case ARPHRD_SIT:
-	case ARPHRD_IPGRE:
-	case ARPHRD_VOID:
-	case ARPHRD_NONE:
-		return __bpf_redirect_no_mac(skb, dev, flags);
-	default:
+	if (dev_is_mac_header_xmit(dev))
 		return __bpf_redirect_common(skb, dev, flags);
-	}
+	else
+		return __bpf_redirect_no_mac(skb, dev, flags);
 }
 
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index b2d417b8f46c..1af7baa732a3 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/gfp.h>
+#include <linux/if_arp.h>
 #include <net/net_namespace.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -73,20 +74,6 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
 static unsigned int mirred_net_id;
 static struct tc_action_ops act_mirred_ops;
 
-static bool dev_is_mac_header_xmit(const struct net_device *dev)
-{
-	switch (dev->type) {
-	case ARPHRD_TUNNEL:
-	case ARPHRD_TUNNEL6:
-	case ARPHRD_SIT:
-	case ARPHRD_IPGRE:
-	case ARPHRD_VOID:
-	case ARPHRD_NONE:
-		return false;
-	}
-	return true;
-}
-
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a, int ovr,
 			   int bind)
-- 
cgit v1.2.3


From 3a6a931dfb8e49a7377825b465d84e110fe89f68 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Sun, 27 Nov 2016 17:02:04 +0200
Subject: net/mlx5e: Support DCBX CEE API

Add DCBX CEE API interface for ConnectX-4. Configurations are stored in
a temporary structure and are applied to the card's firmware when
the CEE's setall callback function is called.

Note:
  priority group in CEE is equivalent to traffic class in ConnectX-4
  hardware spec.

  bw allocation per priority in CEE is not supported because ConnectX-4
  only supports bw allocation per traffic class.

  user priority in CEE does not have an equivalent term in ConnectX-4.
  Therefore, user priority to priority mapping in CEE is not supported.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  24 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 301 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/port.c     |  43 +++
 include/linux/mlx5/port.h                          |   4 +
 4 files changed, 370 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a2b32ed24315..31387ed9113b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -221,6 +221,26 @@ struct mlx5e_params {
 	u32 lro_timeout;
 };
 
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+struct mlx5e_cee_config {
+	/* bw pct for priority group */
+	u8                         pg_bw_pct[CEE_DCBX_MAX_PGS];
+	u8                         prio_to_pg_map[CEE_DCBX_MAX_PRIO];
+	bool                       pfc_setting[CEE_DCBX_MAX_PRIO];
+	bool                       pfc_enable;
+};
+
+enum {
+	MLX5_DCB_CHG_RESET,
+	MLX5_DCB_NO_CHG,
+	MLX5_DCB_CHG_NO_RESET,
+};
+
+struct mlx5e_dcbx {
+	struct mlx5e_cee_config    cee_cfg; /* pending configuration */
+};
+#endif
+
 struct mlx5e_tstamp {
 	rwlock_t                   lock;
 	struct cyclecounter        cycles;
@@ -688,6 +708,10 @@ struct mlx5e_priv {
 	struct mlx5e_stats         stats;
 	struct mlx5e_tstamp        tstamp;
 	u16 q_counter;
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+	struct mlx5e_dcbx          dcbx;
+#endif
+
 	const struct mlx5e_profile *profile;
 	void                      *ppriv;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index 762af16ed021..059524324fdb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -38,6 +38,9 @@
 #define MLX5E_100MB (100000)
 #define MLX5E_1GB   (1000000)
 
+#define MLX5E_CEE_STATE_UP    1
+#define MLX5E_CEE_STATE_DOWN  0
+
 static int mlx5e_dcbnl_ieee_getets(struct net_device *netdev,
 				   struct ieee_ets *ets)
 {
@@ -222,13 +225,15 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev,
 
 static u8 mlx5e_dcbnl_getdcbx(struct net_device *dev)
 {
-	return DCB_CAP_DCBX_HOST | DCB_CAP_DCBX_VER_IEEE;
+	return DCB_CAP_DCBX_HOST |
+	       DCB_CAP_DCBX_VER_IEEE |
+	       DCB_CAP_DCBX_VER_CEE;
 }
 
 static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode)
 {
 	if ((mode & DCB_CAP_DCBX_LLD_MANAGED) ||
-	    (mode & DCB_CAP_DCBX_VER_CEE) ||
+	    !(mode & DCB_CAP_DCBX_VER_CEE) ||
 	    !(mode & DCB_CAP_DCBX_VER_IEEE) ||
 	    !(mode & DCB_CAP_DCBX_HOST))
 		return 1;
@@ -304,6 +309,281 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev,
 	return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
 }
 
+static u8 mlx5e_dcbnl_setall(struct net_device *netdev)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct ieee_ets ets;
+	struct ieee_pfc pfc;
+	int err;
+	int i;
+
+	memset(&ets, 0, sizeof(ets));
+	memset(&pfc, 0, sizeof(pfc));
+
+	ets.ets_cap = IEEE_8021QAZ_MAX_TCS;
+	for (i = 0; i < CEE_DCBX_MAX_PGS; i++) {
+		ets.tc_tx_bw[i] = cee_cfg->pg_bw_pct[i];
+		ets.tc_rx_bw[i] = cee_cfg->pg_bw_pct[i];
+		ets.tc_tsa[i]   = IEEE_8021QAZ_TSA_ETS;
+		ets.prio_tc[i]  = cee_cfg->prio_to_pg_map[i];
+	}
+
+	err = mlx5e_dbcnl_validate_ets(netdev, &ets);
+	if (err) {
+		netdev_err(netdev,
+			   "%s, Failed to validate ETS: %d\n", __func__, err);
+		goto out;
+	}
+
+	err = mlx5e_dcbnl_ieee_setets_core(priv, &ets);
+	if (err) {
+		netdev_err(netdev,
+			   "%s, Failed to set ETS: %d\n", __func__, err);
+		goto out;
+	}
+
+	/* Set PFC */
+	pfc.pfc_cap = mlx5_max_tc(mdev) + 1;
+	if (!cee_cfg->pfc_enable)
+		pfc.pfc_en = 0;
+	else
+		for (i = 0; i < CEE_DCBX_MAX_PRIO; i++)
+			pfc.pfc_en |= cee_cfg->pfc_setting[i] << i;
+
+	err = mlx5e_dcbnl_ieee_setpfc(netdev, &pfc);
+	if (err) {
+		netdev_err(netdev,
+			   "%s, Failed to set PFC: %d\n", __func__, err);
+		goto out;
+	}
+out:
+	return err ? MLX5_DCB_NO_CHG : MLX5_DCB_CHG_RESET;
+}
+
+static u8 mlx5e_dcbnl_getstate(struct net_device *netdev)
+{
+	return MLX5E_CEE_STATE_UP;
+}
+
+static void mlx5e_dcbnl_getpermhwaddr(struct net_device *netdev,
+				      u8 *perm_addr)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+
+	if (!perm_addr)
+		return;
+
+	mlx5_query_nic_vport_mac_address(priv->mdev, 0, perm_addr);
+}
+
+static void mlx5e_dcbnl_setpgtccfgtx(struct net_device *netdev,
+				     int priority, u8 prio_type,
+				     u8 pgid, u8 bw_pct, u8 up_map)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	cee_cfg->prio_to_pg_map[priority] = pgid;
+}
+
+static void mlx5e_dcbnl_setpgbwgcfgtx(struct net_device *netdev,
+				      int pgid, u8 bw_pct)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	cee_cfg->pg_bw_pct[pgid] = bw_pct;
+}
+
+static void mlx5e_dcbnl_getpgtccfgtx(struct net_device *netdev,
+				     int priority, u8 *prio_type,
+				     u8 *pgid, u8 *bw_pct, u8 *up_map)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	*prio_type = 0;
+	*bw_pct = 0;
+	*up_map = 0;
+
+	if (mlx5_query_port_prio_tc(mdev, priority, pgid))
+		*pgid = 0;
+}
+
+static void mlx5e_dcbnl_getpgbwgcfgtx(struct net_device *netdev,
+				      int pgid, u8 *bw_pct)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	if (pgid >= CEE_DCBX_MAX_PGS) {
+		netdev_err(netdev,
+			   "%s, priority group is out of range\n", __func__);
+		return;
+	}
+
+	if (mlx5_query_port_tc_bw_alloc(mdev, pgid, bw_pct))
+		*bw_pct = 0;
+}
+
+static void mlx5e_dcbnl_setpfccfg(struct net_device *netdev,
+				  int priority, u8 setting)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (setting > 1)
+		return;
+
+	cee_cfg->pfc_setting[priority] = setting;
+}
+
+static int
+mlx5e_dcbnl_get_priority_pfc(struct net_device *netdev,
+			     int priority, u8 *setting)
+{
+	struct ieee_pfc pfc;
+	int err;
+
+	err = mlx5e_dcbnl_ieee_getpfc(netdev, &pfc);
+
+	if (err)
+		*setting = 0;
+	else
+		*setting = (pfc.pfc_en >> priority) & 0x01;
+
+	return err;
+}
+
+static void mlx5e_dcbnl_getpfccfg(struct net_device *netdev,
+				  int priority, u8 *setting)
+{
+	if (priority >= CEE_DCBX_MAX_PRIO) {
+		netdev_err(netdev,
+			   "%s, priority is out of range\n", __func__);
+		return;
+	}
+
+	if (!setting)
+		return;
+
+	mlx5e_dcbnl_get_priority_pfc(netdev, priority, setting);
+}
+
+static u8 mlx5e_dcbnl_getcap(struct net_device *netdev,
+			     int capid, u8 *cap)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+	u8 rval = 0;
+
+	switch (capid) {
+	case DCB_CAP_ATTR_PG:
+		*cap = true;
+		break;
+	case DCB_CAP_ATTR_PFC:
+		*cap = true;
+		break;
+	case DCB_CAP_ATTR_UP2TC:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_PG_TCS:
+		*cap = 1 << mlx5_max_tc(mdev);
+		break;
+	case DCB_CAP_ATTR_PFC_TCS:
+		*cap = 1 << mlx5_max_tc(mdev);
+		break;
+	case DCB_CAP_ATTR_GSP:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_BCN:
+		*cap = false;
+		break;
+	case DCB_CAP_ATTR_DCBX:
+		*cap = (DCB_CAP_DCBX_LLD_MANAGED |
+			DCB_CAP_DCBX_VER_CEE |
+			DCB_CAP_DCBX_STATIC);
+		break;
+	default:
+		*cap = 0;
+		rval = 1;
+		break;
+	}
+
+	return rval;
+}
+
+static int mlx5e_dcbnl_getnumtcs(struct net_device *netdev,
+				 int tcs_id, u8 *num)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	switch (tcs_id) {
+	case DCB_NUMTCS_ATTR_PG:
+	case DCB_NUMTCS_ATTR_PFC:
+		*num = mlx5_max_tc(mdev) + 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static u8 mlx5e_dcbnl_getpfcstate(struct net_device *netdev)
+{
+	struct ieee_pfc pfc;
+
+	if (mlx5e_dcbnl_ieee_getpfc(netdev, &pfc))
+		return MLX5E_CEE_STATE_DOWN;
+
+	return pfc.pfc_en ? MLX5E_CEE_STATE_UP : MLX5E_CEE_STATE_DOWN;
+}
+
+static void mlx5e_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_cee_config *cee_cfg = &priv->dcbx.cee_cfg;
+
+	if ((state != MLX5E_CEE_STATE_UP) && (state != MLX5E_CEE_STATE_DOWN))
+		return;
+
+	cee_cfg->pfc_enable = state;
+}
+
 const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
 	.ieee_getets	= mlx5e_dcbnl_ieee_getets,
 	.ieee_setets	= mlx5e_dcbnl_ieee_setets,
@@ -313,4 +593,21 @@ const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops = {
 	.ieee_setpfc	= mlx5e_dcbnl_ieee_setpfc,
 	.getdcbx	= mlx5e_dcbnl_getdcbx,
 	.setdcbx	= mlx5e_dcbnl_setdcbx,
+
+/* CEE interfaces */
+	.setall         = mlx5e_dcbnl_setall,
+	.getstate       = mlx5e_dcbnl_getstate,
+	.getpermhwaddr  = mlx5e_dcbnl_getpermhwaddr,
+
+	.setpgtccfgtx   = mlx5e_dcbnl_setpgtccfgtx,
+	.setpgbwgcfgtx  = mlx5e_dcbnl_setpgbwgcfgtx,
+	.getpgtccfgtx   = mlx5e_dcbnl_getpgtccfgtx,
+	.getpgbwgcfgtx  = mlx5e_dcbnl_getpgbwgcfgtx,
+
+	.setpfccfg      = mlx5e_dcbnl_setpfccfg,
+	.getpfccfg      = mlx5e_dcbnl_getpfccfg,
+	.getcap         = mlx5e_dcbnl_getcap,
+	.getnumtcs      = mlx5e_dcbnl_getnumtcs,
+	.getpfcstate    = mlx5e_dcbnl_getpfcstate,
+	.setpfcstate    = mlx5e_dcbnl_setpfcstate,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index b77928f5b46e..ed4898fcadc9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -572,6 +572,28 @@ int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc)
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_prio_tc);
 
+int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev,
+			    u8 prio, u8 *tc)
+{
+	u32 in[MLX5_ST_SZ_DW(qtct_reg)];
+	u32 out[MLX5_ST_SZ_DW(qtct_reg)];
+	int err;
+
+	memset(in, 0, sizeof(in));
+	memset(out, 0, sizeof(out));
+
+	MLX5_SET(qtct_reg, in, port_number, 1);
+	MLX5_SET(qtct_reg, in, prio, prio);
+
+	err = mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				   sizeof(out), MLX5_REG_QTCT, 0, 0);
+	if (!err)
+		*tc = MLX5_GET(qtct_reg, out, tclass);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_prio_tc);
+
 static int mlx5_set_port_qetcr_reg(struct mlx5_core_dev *mdev, u32 *in,
 				   int inlen)
 {
@@ -625,6 +647,27 @@ int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw)
 }
 EXPORT_SYMBOL_GPL(mlx5_set_port_tc_bw_alloc);
 
+int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev,
+				u8 tc, u8 *bw_pct)
+{
+	u32 out[MLX5_ST_SZ_DW(qetc_reg)];
+	void *ets_tcn_conf;
+	int err;
+
+	err = mlx5_query_port_qetcr_reg(mdev, out, sizeof(out));
+	if (err)
+		return err;
+
+	ets_tcn_conf = MLX5_ADDR_OF(qetc_reg, out,
+				    tc_configuration[tc]);
+
+	*bw_pct = MLX5_GET(ets_tcn_config_reg, ets_tcn_conf,
+			   bw_allocation);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_query_port_tc_bw_alloc);
+
 int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 				    u8 *max_bw_value,
 				    u8 *max_bw_units)
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index dde8c7ec5ff1..bdee439f8cf3 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -141,8 +141,12 @@ int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx,
 int mlx5_max_tc(struct mlx5_core_dev *mdev);
 
 int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc);
+int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev,
+			    u8 prio, u8 *tc);
 int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group);
 int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw);
+int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev,
+				u8 tc, u8 *bw_pct);
 int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 				    u8 *max_bw_value,
 				    u8 *max_bw_unit);
-- 
cgit v1.2.3


From 341c5ee2fb78420ffc441df36f93226be8069b0a Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Sun, 27 Nov 2016 17:02:06 +0200
Subject: net/mlx5: Add DCBX firmware commands support

Add set/query commands for DCBX_PARAM register

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 20 ++++++++++++++++++++
 include/linux/mlx5/driver.h                    |  7 +++++++
 include/linux/mlx5/port.h                      |  2 ++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index ed4898fcadc9..d2ec9d232a70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -548,6 +548,26 @@ int mlx5_max_tc(struct mlx5_core_dev *mdev)
 	return num_tc - 1;
 }
 
+int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out)
+{
+	u32 in[MLX5_ST_SZ_DW(dcbx_param)] = {0};
+
+	MLX5_SET(dcbx_param, in, port_number, 1);
+
+	return  mlx5_core_access_reg(mdev, in, sizeof(in), out,
+				    sizeof(in), MLX5_REG_DCBX_PARAM, 0, 0);
+}
+
+int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in)
+{
+	u32 out[MLX5_ST_SZ_DW(dcbx_param)];
+
+	MLX5_SET(dcbx_param, in, port_number, 1);
+
+	return mlx5_core_access_reg(mdev, in, sizeof(out), out,
+				    sizeof(out), MLX5_REG_DCBX_PARAM, 0, 1);
+}
+
 int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc)
 {
 	u32 in[MLX5_ST_SZ_DW(qtct_reg)] = {0};
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ae1f451e8f89..68b85efc3908 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -104,6 +104,8 @@ enum {
 enum {
 	MLX5_REG_QETCR		 = 0x4005,
 	MLX5_REG_QTCT		 = 0x400a,
+	MLX5_REG_DCBX_PARAM      = 0x4020,
+	MLX5_REG_DCBX_APP        = 0x4021,
 	MLX5_REG_PCAP		 = 0x5001,
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
@@ -124,6 +126,11 @@ enum {
 	MLX5_REG_MPCNT		 = 0x9051,
 };
 
+enum mlx5_dcbx_oper_mode {
+	MLX5E_DCBX_PARAM_VER_OPER_HOST  = 0x0,
+	MLX5E_DCBX_PARAM_VER_OPER_AUTO  = 0x3,
+};
+
 enum {
 	MLX5_ATOMIC_OPS_CMP_SWAP	= 1 << 0,
 	MLX5_ATOMIC_OPS_FETCH_ADD	= 1 << 1,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index bdee439f8cf3..e527732fb31b 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -162,4 +162,6 @@ void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
 			     u16 offset, u16 size, u8 *data);
 
+int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
+int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From d853d145ea3e63387a2ac759aa41d5e43876e561 Mon Sep 17 00:00:00 2001
From: jbrunet <jbrunet@baylibre.com>
Date: Mon, 28 Nov 2016 10:46:46 +0100
Subject: net: phy: add an option to disable EEE advertisement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds an option to disable EEE advertisement in the generic PHY
by providing a mask of prohibited modes corresponding to the value found in
the MDIO_AN_EEE_ADV register.

On some platforms, PHY Low power idle seems to be causing issues, even
breaking the link in some cases. The patch provides a convenient way for these
platforms to disable EEE advertisement and work around the issue.

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Tested-by: Yegor Yefremov <yegorslists@googlemail.com>
Tested-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c        |  3 ++
 drivers/net/phy/phy_device.c | 80 +++++++++++++++++++++++++++++++++++++++-----
 include/linux/phy.h          |  3 ++
 3 files changed, 77 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 73adbaa9ac86..a3981cc6448a 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1396,6 +1396,9 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data)
 {
 	int val = ethtool_adv_to_mmd_eee_adv_t(data->advertised);
 
+	/* Mask prohibited EEE modes */
+	val &= ~phydev->eee_broken_modes;
+
 	phy_write_mmd_indirect(phydev, MDIO_AN_EEE_ADV, MDIO_MMD_AN, val);
 
 	return 0;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index ba86c191a13e..83e52f1b80f2 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1120,6 +1120,43 @@ static int genphy_config_advert(struct phy_device *phydev)
 	return changed;
 }
 
+/**
+ * genphy_config_eee_advert - disable unwanted eee mode advertisement
+ * @phydev: target phy_device struct
+ *
+ * Description: Writes MDIO_AN_EEE_ADV after disabling unsupported energy
+ *   efficient ethernet modes. Returns 0 if the PHY's advertisement hasn't
+ *   changed, and 1 if it has changed.
+ */
+static int genphy_config_eee_advert(struct phy_device *phydev)
+{
+	int broken = phydev->eee_broken_modes;
+	int old_adv, adv;
+
+	/* Nothing to disable */
+	if (!broken)
+		return 0;
+
+	/* adv is signed so a negative errno from the read is caught below.
+	 * If the read fails, we assume that EEE is not supported by the phy.
+	 * If we read 0, EEE is not advertised. In both cases, stop here.
+	 */
+	adv = phy_read_mmd_indirect(phydev, MDIO_AN_EEE_ADV, MDIO_MMD_AN);
+	if (adv <= 0)
+		return 0;
+
+	old_adv = adv;
+	adv &= ~broken;
+
+	/* Advertising remains unchanged with the broken mask */
+	if (old_adv == adv)
+		return 0;
+
+	phy_write_mmd_indirect(phydev, MDIO_AN_EEE_ADV, MDIO_MMD_AN, adv);
+
+	return 1;
+}
+
 /**
  * genphy_setup_forced - configures/forces speed/duplex from @phydev
  * @phydev: target phy_device struct
@@ -1178,15 +1215,20 @@ EXPORT_SYMBOL(genphy_restart_aneg);
  */
 int genphy_config_aneg(struct phy_device *phydev)
 {
-	int result;
+	int err, changed;
+
+	changed = genphy_config_eee_advert(phydev);
 
 	if (AUTONEG_ENABLE != phydev->autoneg)
 		return genphy_setup_forced(phydev);
 
-	result = genphy_config_advert(phydev);
-	if (result < 0) /* error */
-		return result;
-	if (result == 0) {
+	err = genphy_config_advert(phydev);
+	if (err < 0) /* error */
+		return err;
+
+	changed |= err;
+
+	if (changed == 0) {
 		/* Advertisement hasn't changed, but maybe aneg was never on to
 		 * begin with?  Or maybe phy was isolated?
 		 */
@@ -1196,16 +1238,16 @@ int genphy_config_aneg(struct phy_device *phydev)
 			return ctl;
 
 		if (!(ctl & BMCR_ANENABLE) || (ctl & BMCR_ISOLATE))
-			result = 1; /* do restart aneg */
+			changed = 1; /* do restart aneg */
 	}
 
 	/* Only restart aneg if we are advertising something different
 	 * than we were before.
 	 */
-	if (result > 0)
-		result = genphy_restart_aneg(phydev);
+	if (changed > 0)
+		return genphy_restart_aneg(phydev);
 
-	return result;
+	return 0;
 }
 EXPORT_SYMBOL(genphy_config_aneg);
 
@@ -1563,6 +1605,21 @@ static void of_set_phy_supported(struct phy_device *phydev)
 		__set_phy_supported(phydev, max_speed);
 }
 
+static void of_set_phy_eee_broken(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	u32 broken;
+
+	if (!IS_ENABLED(CONFIG_OF_MDIO))
+		return;
+
+	if (!node)
+		return;
+
+	if (!of_property_read_u32(node, "eee-broken-modes", &broken))
+		phydev->eee_broken_modes = broken;
+}
+
 /**
  * phy_probe - probe and init a PHY device
  * @dev: device to probe and init
@@ -1600,6 +1657,11 @@ static int phy_probe(struct device *dev)
 	of_set_phy_supported(phydev);
 	phydev->advertising = phydev->supported;
 
+	/* Get the EEE modes we want to prohibit. We will ask
+	 * the PHY to stop advertising these modes later on
+	 */
+	of_set_phy_eee_broken(phydev);
+
 	/* Set the state to READY by default */
 	phydev->state = PHY_READY;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index edde28ce163a..b53177fd38af 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -417,6 +417,9 @@ struct phy_device {
 	u32 advertising;
 	u32 lp_advertising;
 
+	/* Energy efficient ethernet modes which should be prohibited */
+	u32 eee_broken_modes;
+
 	int autoneg;
 
 	int link_timeout;
-- 
cgit v1.2.3


From 05b055e89121394058c75dc354e9a46e1e765579 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:13 -0800
Subject: tcp: instrument tcp sender limits chronographs

This patch implements the skeleton of the TCP chronograph
instrumentation on sender side limits:

	1) idle (unspec)
	2) busy sending data other than 3-4 below
	3) rwnd-limited
	4) sndbuf-limited

The limits are enumerated 'tcp_chrono'. Since a connection in
theory can idle forever, we do not track the actual length of this
uninteresting idle period. For the rest we track how long the sender
spends in each limit. At any point during the life time of a
connection, the sender must be in one of the four states.

If there are multiple conditions worthy of tracking in a chronograph
then the highest priority enum takes precedence over
the other conditions. So that if something "more interesting"
starts happening, stop the previous chrono and start a new one.

The time unit is jiffy(u32) in order to save space in tcp_sock.
This implies an application must sample the stats at least once every
49 days with a 1ms jiffy.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  7 +++++--
 include/net/tcp.h     | 14 ++++++++++++++
 net/ipv4/tcp_output.c | 30 ++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 32a7c7e35b71..d5d3bd814338 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -211,8 +211,11 @@ struct tcp_sock {
 		u8 reord;    /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
-	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:7;
+	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
+	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
+	u8	chrono_type:2,	/* current chronograph type */
+		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+		unused:5;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de80739adab..e5ff4083870d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1516,6 +1516,20 @@ struct tcp_fastopen_context {
 	struct rcu_head		rcu;
 };
 
+/* Latencies incurred by various limits for a sender. They are
+ * chronograph-like stats that are mutually exclusive.
+ */
+enum tcp_chrono {
+	TCP_CHRONO_UNSPEC,
+	TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
+	TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
+	TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
+	__TCP_CHRONO_MAX,
+};
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19105b46a304..34f751776a01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2081,6 +2081,36 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	return false;
 }
 
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+	const u32 now = tcp_time_stamp;
+
+	if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+		tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+	tp->chrono_start = now;
+	tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If there are multiple conditions worthy of tracking in a
+	 * chronograph then the highest priority enum takes precedence over
+	 * the other conditions. So that if something "more interesting"
+	 * starts happening, stop the previous chrono and start a new one.
+	 */
+	if (type > tp->chrono_type)
+		tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+}
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
-- 
cgit v1.2.3


From 1c885808e45601b2b6f68b30ac1d999e10b6f606 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:18 -0800
Subject: tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING

This patch exports the sender chronograph stats via the socket
SO_TIMESTAMPING channel. Currently we can instrument how long a
particular application unit of data was queued in TCP by tracking
SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having
these sender chronograph stats exported simultaneously along with
these timestamps allows further breaking down the various sender
limitations.  For example, a video server can tell if a particular
chunk of video on a connection takes a long time to deliver because
TCP was experiencing a small receive window. It is not possible to
tell before this patch without packet traces.

To prepare these stats, the user needs to set
SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags
while requesting other SOF_TIMESTAMPING TX timestamps. When the
timestamps are available in the error queue, the stats are returned
in a separate control message of type SCM_TIMESTAMPING_OPT_STATS,
in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY,
TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. The unit is microseconds.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/timestamping.txt | 10 ++++++++++
 arch/alpha/include/uapi/asm/socket.h      |  2 ++
 arch/frv/include/uapi/asm/socket.h        |  2 ++
 arch/ia64/include/uapi/asm/socket.h       |  2 ++
 arch/m32r/include/uapi/asm/socket.h       |  2 ++
 arch/mips/include/uapi/asm/socket.h       |  2 ++
 arch/mn10300/include/uapi/asm/socket.h    |  2 ++
 arch/parisc/include/uapi/asm/socket.h     |  2 ++
 arch/powerpc/include/uapi/asm/socket.h    |  2 ++
 arch/s390/include/uapi/asm/socket.h       |  2 ++
 arch/sparc/include/uapi/asm/socket.h      |  2 ++
 arch/xtensa/include/uapi/asm/socket.h     |  2 ++
 include/linux/tcp.h                       |  2 ++
 include/uapi/asm-generic/socket.h         |  2 ++
 include/uapi/linux/net_tstamp.h           |  3 ++-
 include/uapi/linux/tcp.h                  |  8 ++++++++
 net/core/skbuff.c                         | 14 +++++++++++---
 net/core/sock.c                           |  7 +++++++
 net/ipv4/tcp.c                            | 20 ++++++++++++++++++++
 net/socket.c                              |  7 ++++++-
 20 files changed, 90 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt
index 671cccf0dcd2..96f50694a748 100644
--- a/Documentation/networking/timestamping.txt
+++ b/Documentation/networking/timestamping.txt
@@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
   the timestamp even if sysctl net.core.tstamp_allow_data is 0.
   This option disables SOF_TIMESTAMPING_OPT_CMSG.
 
+SOF_TIMESTAMPING_OPT_STATS:
+
+  Optional stats that are obtained along with the transmit timestamps.
+  It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
+  transmit timestamp is available, the stats are available in a
+  separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
+  list of TLVs (struct nlattr) of types. These stats allow the
+  application to associate various transport layer stats with
+  the transmit timestamps, such as how long a certain block of
+  data was limited by peer's receiver window.
 
 New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
 disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 9e46d6e656d9..afc901b7a6f6 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
index afbc98f02d27..81e03530ed39 100644
--- a/arch/frv/include/uapi/asm/socket.h
+++ b/arch/frv/include/uapi/asm/socket.h
@@ -90,5 +90,7 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
 
diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
index 0018fad9039f..57feb0c1f7d7 100644
--- a/arch/ia64/include/uapi/asm/socket.h
+++ b/arch/ia64/include/uapi/asm/socket.h
@@ -99,4 +99,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_IA64_SOCKET_H */
diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
index 5fe42fc7b6c5..5853f8e92c20 100644
--- a/arch/m32r/include/uapi/asm/socket.h
+++ b/arch/m32r/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_M32R_SOCKET_H */
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 2027240aafbb..566ecdcb5b4b 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -108,4 +108,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
index 5129f23a9ee1..0e12527c4b0e 100644
--- a/arch/mn10300/include/uapi/asm/socket.h
+++ b/arch/mn10300/include/uapi/asm/socket.h
@@ -90,4 +90,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 9c935d717df9..7a109b73ddf7 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -89,4 +89,6 @@
 
 #define SO_CNX_ADVICE		0x402E
 
+#define SCM_TIMESTAMPING_OPT_STATS	0x402F
+
 #endif /* _UAPI_ASM_SOCKET_H */
diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
index 1672e3398270..44583a52f882 100644
--- a/arch/powerpc/include/uapi/asm/socket.h
+++ b/arch/powerpc/include/uapi/asm/socket.h
@@ -97,4 +97,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif	/* _ASM_POWERPC_SOCKET_H */
diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
index 41b51c2f4f1b..b24a64cbfeb1 100644
--- a/arch/s390/include/uapi/asm/socket.h
+++ b/arch/s390/include/uapi/asm/socket.h
@@ -96,4 +96,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 31aede3af088..a25dc32f5d6a 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -86,6 +86,8 @@
 
 #define SO_CNX_ADVICE		0x0037
 
+#define SCM_TIMESTAMPING_OPT_STATS	0x0038
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
index 81435d995e11..9fdbe1fe0473 100644
--- a/arch/xtensa/include/uapi/asm/socket.h
+++ b/arch/xtensa/include/uapi/asm/socket.h
@@ -101,4 +101,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif	/* _XTENSA_SOCKET_H */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d5d3bd814338..00e0ee8f001f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -428,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 	tp->saved_syn = NULL;
 }
 
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 67d632f1743d..2c748ddad5f8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -92,4 +92,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 264e515de16f..464dcca5ed68 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -25,8 +25,9 @@ enum {
 	SOF_TIMESTAMPING_TX_ACK = (1<<9),
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
+	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 2863b661d6e1..c53de2691cec 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -220,6 +220,14 @@ struct tcp_info {
 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
 };
 
+/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+enum {
+	TCP_NLA_PAD,
+	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
+	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
+	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
+};
+
 /* for TCP_MD5SIG socket option */
 #define TCP_MD5SIG_MAXKEYLEN	80
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d1d1a5a5ad24..ea6fa954c7a0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3839,10 +3839,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!skb_may_tx_timestamp(sk, tsonly))
 		return;
 
-	if (tsonly)
-		skb = alloc_skb(0, GFP_ATOMIC);
-	else
+	if (tsonly) {
+#ifdef CONFIG_INET
+		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+		    sk->sk_protocol == IPPROTO_TCP &&
+		    sk->sk_type == SOCK_STREAM)
+			skb = tcp_get_timestamping_opt_stats(sk);
+		else
+#endif
+			skb = alloc_skb(0, GFP_ATOMIC);
+	} else {
 		skb = skb_clone(orig_skb, GFP_ATOMIC);
+	}
 	if (!skb)
 		return;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 14e6145be33b..d8c7f8c877ca 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -854,6 +854,13 @@ set_rcvbuf:
 				sk->sk_tskey = 0;
 			}
 		}
+
+		if (val & SOF_TIMESTAMPING_OPT_STATS &&
+		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+			ret = -EINVAL;
+			break;
+		}
+
 		sk->sk_tsflags = val;
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cdde20f49999..1149b48700a1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2841,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *stats;
+	struct tcp_info info;
+
+	stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	if (!stats)
+		return NULL;
+
+	tcp_get_info_chrono_stats(tp, &info);
+	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+			  info.tcpi_busy_time, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+	return stats;
+}
+
 static int do_tcp_getsockopt(struct sock *sk, int level,
 		int optname, char __user *optval, int __user *optlen)
 {
diff --git a/net/socket.c b/net/socket.c
index e2584c51aa1f..e6318943ad07 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -693,9 +693,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
 	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
 		empty = 0;
-	if (!empty)
+	if (!empty) {
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(tss), &tss);
+
+		if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS))
+			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
+				 skb->len, skb->data);
+	}
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
-- 
cgit v1.2.3


From 820ee17b8d3b2a57b1ea20b247cc6a1dddaf8b8d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 29 Nov 2016 09:57:17 -0800
Subject: net: phy: broadcom: Add support code for reading PHY counters

Broadcom PHYs expose a number of PHY error counters: receive errors,
false carrier sense, SerDes BER count, local and remote receive errors.
Add support code to allow retrieving these error counters. Since the
Broadcom PHY library code is used by several drivers, make it possible
for them to specify the storage for the software copy of the statistics.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/bcm-phy-lib.c | 70 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/phy/bcm-phy-lib.h |  5 ++++
 include/linux/brcmphy.h       |  3 ++
 3 files changed, 78 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c
index 3156ce6d5861..ab9ad689617c 100644
--- a/drivers/net/phy/bcm-phy-lib.c
+++ b/drivers/net/phy/bcm-phy-lib.c
@@ -17,6 +17,7 @@
 #include <linux/mdio.h>
 #include <linux/module.h>
 #include <linux/phy.h>
+#include <linux/ethtool.h>
 
 #define MII_BCM_CHANNEL_WIDTH     0x2000
 #define BCM_CL45VEN_EEE_ADV       0x3c
@@ -317,6 +318,75 @@ int bcm_phy_downshift_set(struct phy_device *phydev, u8 count)
 }
 EXPORT_SYMBOL_GPL(bcm_phy_downshift_set);
 
+struct bcm_phy_hw_stat {
+	const char *string;
+	u8 reg;
+	u8 shift;
+	u8 bits;
+};
+
+/* Counters freeze at either 0xffff or 0xff, better than nothing */
+static const struct bcm_phy_hw_stat bcm_phy_hw_stats[] = {
+	{ "phy_receive_errors", MII_BRCM_CORE_BASE12, 0, 16 },
+	{ "phy_serdes_ber_errors", MII_BRCM_CORE_BASE13, 8, 8 },
+	{ "phy_false_carrier_sense_errors", MII_BRCM_CORE_BASE13, 0, 8 },
+	{ "phy_local_rcvr_nok", MII_BRCM_CORE_BASE14, 8, 8 },
+	{ "phy_remote_rcv_nok", MII_BRCM_CORE_BASE14, 0, 8 },
+};
+
+int bcm_phy_get_sset_count(struct phy_device *phydev)
+{
+	return ARRAY_SIZE(bcm_phy_hw_stats);
+}
+EXPORT_SYMBOL_GPL(bcm_phy_get_sset_count);
+
+void bcm_phy_get_strings(struct phy_device *phydev, u8 *data)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bcm_phy_hw_stats); i++)
+		memcpy(data + i * ETH_GSTRING_LEN,
+		       bcm_phy_hw_stats[i].string, ETH_GSTRING_LEN);
+}
+EXPORT_SYMBOL_GPL(bcm_phy_get_strings);
+
+#ifndef UINT64_MAX
+#define UINT64_MAX              (u64)(~((u64)0))
+#endif
+
+/* Caller is supposed to provide appropriate storage for the library code to
+ * access the shadow copy
+ */
+static u64 bcm_phy_get_stat(struct phy_device *phydev, u64 *shadow,
+			    unsigned int i)
+{
+	struct bcm_phy_hw_stat stat = bcm_phy_hw_stats[i];
+	int val;
+	u64 ret;
+
+	val = phy_read(phydev, stat.reg);
+	if (val < 0) {
+		ret = UINT64_MAX;
+	} else {
+		val >>= stat.shift;
+		val = val & ((1 << stat.bits) - 1);
+		shadow[i] += val;
+		ret = shadow[i];
+	}
+
+	return ret;
+}
+
+void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow,
+		       struct ethtool_stats *stats, u64 *data)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(bcm_phy_hw_stats); i++)
+		data[i] = bcm_phy_get_stat(phydev, shadow, i);
+}
+EXPORT_SYMBOL_GPL(bcm_phy_get_stats);
+
 MODULE_DESCRIPTION("Broadcom PHY Library");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Broadcom Corporation");
diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h
index a117f657c6d7..7c73808cbbde 100644
--- a/drivers/net/phy/bcm-phy-lib.h
+++ b/drivers/net/phy/bcm-phy-lib.h
@@ -42,4 +42,9 @@ int bcm_phy_downshift_get(struct phy_device *phydev, u8 *count);
 
 int bcm_phy_downshift_set(struct phy_device *phydev, u8 count);
 
+int bcm_phy_get_sset_count(struct phy_device *phydev);
+void bcm_phy_get_strings(struct phy_device *phydev, u8 *data);
+void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow,
+		       struct ethtool_stats *stats, u64 *data);
+
 #endif /* _LINUX_BCM_PHY_LIB_H */
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index f9f8aaf9c943..4f7d8be9ddbf 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -244,6 +244,9 @@
 #define LPI_FEATURE_EN_DIG1000X		0x4000
 
 /* Core register definitions*/
+#define MII_BRCM_CORE_BASE12	0x12
+#define MII_BRCM_CORE_BASE13	0x13
+#define MII_BRCM_CORE_BASE14	0x14
 #define MII_BRCM_CORE_BASE1E	0x1E
 #define MII_BRCM_CORE_EXPB0	0xB0
 #define MII_BRCM_CORE_EXPB1	0xB1
-- 
cgit v1.2.3


From 85de8576a0b14aecc99136cfbf90e367fa2142cb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 28 Nov 2016 23:16:54 +0100
Subject: bpf, xdp: allow to pass flags to dev_change_xdp_fd

Add an IFLA_XDP_FLAGS attribute that can be passed for setting up
XDP along with IFLA_XDP_FD, which eventually allows user space to
implement typical add/replace/delete logic for programs. Right now,
calling into dev_change_xdp_fd() will always replace previous programs.

When passed XDP_FLAGS_UPDATE_IF_NOEXIST, we can handle this more
gracefully when requested by returning -EBUSY in case we try to
attach a new program, but we find that another one is already
attached. This will be used by the upcoming front-end for iproute2
as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 +-
 include/uapi/linux/if_link.h |  4 ++++
 net/core/dev.c               | 20 ++++++++++++++++++--
 net/core/rtnetlink.c         | 14 +++++++++++++-
 4 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..3755317cc6a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3253,7 +3253,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, int fd);
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 92b2d4928bf1..6b13e591abc9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -876,10 +876,14 @@ enum {
 
 /* XDP section */
 
+#define XDP_FLAGS_UPDATE_IF_NOEXIST	(1U << 0)
+#define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST)
+
 enum {
 	IFLA_XDP_UNSPEC,
 	IFLA_XDP_FD,
 	IFLA_XDP_ATTACHED,
+	IFLA_XDP_FLAGS,
 	__IFLA_XDP_MAX,
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 048b46b7c92a..bffb5253e778 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6692,26 +6692,42 @@ EXPORT_SYMBOL(dev_change_proto_down);
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
  *	@fd: new program fd or negative value to clear
+ *	@flags: xdp-related flags
  *
  *	Set or clear a bpf program for a device
  */
-int dev_change_xdp_fd(struct net_device *dev, int fd)
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct bpf_prog *prog = NULL;
-	struct netdev_xdp xdp = {};
+	struct netdev_xdp xdp;
 	int err;
 
+	ASSERT_RTNL();
+
 	if (!ops->ndo_xdp)
 		return -EOPNOTSUPP;
 	if (fd >= 0) {
+		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
+			memset(&xdp, 0, sizeof(xdp));
+			xdp.command = XDP_QUERY_PROG;
+
+			err = ops->ndo_xdp(dev, &xdp);
+			if (err < 0)
+				return err;
+			if (xdp.prog_attached)
+				return -EBUSY;
+		}
+
 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
 
+	memset(&xdp, 0, sizeof(xdp));
 	xdp.command = XDP_SETUP_PROG;
 	xdp.prog = prog;
+
 	err = ops->ndo_xdp(dev, &xdp);
 	if (err < 0 && prog)
 		bpf_prog_put(prog);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 4e60525ea586..bd85570e6e4b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1505,6 +1505,7 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
 static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
 	[IFLA_XDP_FD]		= { .type = NLA_S32 },
 	[IFLA_XDP_ATTACHED]	= { .type = NLA_U8 },
+	[IFLA_XDP_FLAGS]	= { .type = NLA_U32 },
 };
 
 static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -2164,6 +2165,7 @@ static int do_setlink(const struct sk_buff *skb,
 
 	if (tb[IFLA_XDP]) {
 		struct nlattr *xdp[IFLA_XDP_MAX + 1];
+		u32 xdp_flags = 0;
 
 		err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
 				       ifla_xdp_policy);
@@ -2174,9 +2176,19 @@ static int do_setlink(const struct sk_buff *skb,
 			err = -EINVAL;
 			goto errout;
 		}
+
+		if (xdp[IFLA_XDP_FLAGS]) {
+			xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
+			if (xdp_flags & ~XDP_FLAGS_MASK) {
+				err = -EINVAL;
+				goto errout;
+			}
+		}
+
 		if (xdp[IFLA_XDP_FD]) {
 			err = dev_change_xdp_fd(dev,
-						nla_get_s32(xdp[IFLA_XDP_FD]));
+						nla_get_s32(xdp[IFLA_XDP_FD]),
+						xdp_flags);
 			if (err)
 				goto errout;
 			status |= DO_SETLINK_NOTIFY;
-- 
cgit v1.2.3


From b634d30a79ecc2d28e61cbe5b1f4443952f37a8f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 30 Nov 2016 10:16:08 -0800
Subject: cgroup, bpf: remove unnecessary #include

This #include is unnecessary and brings a whole set of
other headers into cgroup-defs.h. Remove it.

Fixes: 3007098494be ("cgroup: add support for eBPF programs")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Rami Rosen <roszenrami@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ec80d0c0953e..0cf1adfadd2d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -1,7 +1,6 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
-#include <linux/bpf.h>
 #include <linux/jump_label.h>
 #include <uapi/linux/bpf.h>
 
-- 
cgit v1.2.3


From 6d937acfb3f166f6e10abd978fafafa120d6f0d7 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Tue, 29 Nov 2016 16:47:01 +0200
Subject: qed: Optimize qed_chain datapath usage

The chain structure and functions are widely used by the qed* modules,
both for configuration and datapath.
E.g., qede's Tx has one such chain and its Rx has two.

Currently, the structure's fields which are required for datapath
related functions [produce/consume] are intertwined with fields which
are required only for configuration purposes [init/destroy/etc.].

This patch re-arranges the chain structure so that all the fields which
are required for datapath usage could reside in a single cacheline instead
of the two which are required today.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   7 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   4 +-
 include/linux/qed/qed_chain.h                     | 144 ++++++++++++----------
 3 files changed, 86 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 5be7b8a25425..80162ee0391f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2283,12 +2283,12 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 {
 	void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl;
 	u32 page_cnt = p_chain->page_cnt, i, pbl_size;
-	u8 *p_pbl_virt = p_chain->pbl.p_virt_table;
+	u8 *p_pbl_virt = p_chain->pbl_sp.p_virt_table;
 
 	if (!pp_virt_addr_tbl)
 		return;
 
-	if (!p_chain->pbl.p_virt_table)
+	if (!p_pbl_virt)
 		goto out;
 
 	for (i = 0; i < page_cnt; i++) {
@@ -2306,7 +2306,8 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 	pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE;
 	dma_free_coherent(&cdev->pdev->dev,
 			  pbl_size,
-			  p_chain->pbl.p_virt_table, p_chain->pbl.p_phys_table);
+			  p_chain->pbl_sp.p_virt_table,
+			  p_chain->pbl_sp.p_phys_table);
 out:
 	vfree(p_chain->pbl.pp_virt_addr_tbl);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 2888eb0628f8..d0a58282f2a8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -347,11 +347,11 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
-		       p_hwfn->p_eq->chain.pbl.p_phys_table);
+		       p_hwfn->p_eq->chain.pbl_sp.p_phys_table);
 	page_cnt = (u8)qed_chain_get_page_cnt(&p_hwfn->p_eq->chain);
 	p_ramrod->event_ring_num_pages = page_cnt;
 	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
-		       p_hwfn->p_consq->chain.pbl.p_phys_table);
+		       p_hwfn->p_consq->chain.pbl_sp.p_phys_table);
 
 	qed_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config);
 
diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 72d88cf3ca25..37dfba101c6c 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -56,23 +56,6 @@ struct qed_chain_pbl_u32 {
 	u32 cons_page_idx;
 };
 
-struct qed_chain_pbl {
-	/* Base address of a pre-allocated buffer for pbl */
-	dma_addr_t	p_phys_table;
-	void		*p_virt_table;
-
-	/* Table for keeping the virtual addresses of the chain pages,
-	 * respectively to the physical addresses in the pbl table.
-	 */
-	void **pp_virt_addr_tbl;
-
-	/* Index to current used page by producer/consumer */
-	union {
-		struct qed_chain_pbl_u16 pbl16;
-		struct qed_chain_pbl_u32 pbl32;
-	} u;
-};
-
 struct qed_chain_u16 {
 	/* Cyclic index of next element to produce/consme */
 	u16 prod_idx;
@@ -86,46 +69,78 @@ struct qed_chain_u32 {
 };
 
 struct qed_chain {
-	void			*p_virt_addr;
-	dma_addr_t		p_phys_addr;
-	void			*p_prod_elem;
-	void			*p_cons_elem;
+	/* fastpath portion of the chain - required for commands such
+	 * as produce / consume.
+	 */
+	/* Point to next element to produce/consume */
+	void *p_prod_elem;
+	void *p_cons_elem;
+
+	/* Fastpath portions of the PBL [if exists] */
+	struct {
+		/* Table for keeping the virtual addresses of the chain pages,
+		 * respectively to the physical addresses in the pbl table.
+		 */
+		void **pp_virt_addr_tbl;
 
-	enum qed_chain_mode	mode;
-	enum qed_chain_use_mode intended_use; /* used to produce/consume */
-	enum qed_chain_cnt_type cnt_type;
+		union {
+			struct qed_chain_pbl_u16 u16;
+			struct qed_chain_pbl_u32 u32;
+		} c;
+	} pbl;
 
 	union {
 		struct qed_chain_u16 chain16;
 		struct qed_chain_u32 chain32;
 	} u;
 
+	/* Capacity counts only usable elements */
+	u32 capacity;
 	u32 page_cnt;
 
-	/* Number of elements - capacity is for usable elements only,
-	 * while size will contain total number of elements [for entire chain].
+	enum qed_chain_mode mode;
+
+	/* Elements information for fast calculations */
+	u16 elem_per_page;
+	u16 elem_per_page_mask;
+	u16 elem_size;
+	u16 next_page_mask;
+	u16 usable_per_page;
+	u8 elem_unusable;
+
+	u8 cnt_type;
+
+	/* Slowpath of the chain - required for initialization and destruction,
+	 * but isn't involved in regular functionality.
 	 */
-	u32 capacity;
+
+	/* Base address of a pre-allocated buffer for pbl */
+	struct {
+		dma_addr_t p_phys_table;
+		void *p_virt_table;
+	} pbl_sp;
+
+	/* Address of first page of the chain - the address is required
+	 * for fastpath operation [consume/produce] but only for the the SINGLE
+	 * flavour which isn't considered fastpath [== SPQ].
+	 */
+	void *p_virt_addr;
+	dma_addr_t p_phys_addr;
+
+	/* Total number of elements [for entire chain] */
 	u32 size;
 
-	/* Elements information for fast calculations */
-	u16			elem_per_page;
-	u16			elem_per_page_mask;
-	u16			elem_unusable;
-	u16			usable_per_page;
-	u16			elem_size;
-	u16			next_page_mask;
-	struct qed_chain_pbl	pbl;
+	u8 intended_use;
 };
 
 #define QED_CHAIN_PBL_ENTRY_SIZE        (8)
 #define QED_CHAIN_PAGE_SIZE             (0x1000)
 #define ELEMS_PER_PAGE(elem_size)       (QED_CHAIN_PAGE_SIZE / (elem_size))
 
-#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)     \
-	((mode == QED_CHAIN_MODE_NEXT_PTR) ?	     \
-	 (1 + ((sizeof(struct qed_chain_next) - 1) / \
-	       (elem_size))) : 0)
+#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)	 \
+	(((mode) == QED_CHAIN_MODE_NEXT_PTR) ?		 \
+	 (u8)(1 + ((sizeof(struct qed_chain_next) - 1) / \
+		   (elem_size))) : 0)
 
 #define USABLE_ELEMS_PER_PAGE(elem_size, mode) \
 	((u32)(ELEMS_PER_PAGE(elem_size) -     \
@@ -186,7 +201,7 @@ static inline u16 qed_chain_get_usable_per_page(struct qed_chain *p_chain)
 	return p_chain->usable_per_page;
 }
 
-static inline u16 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
+static inline u8 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
 {
 	return p_chain->elem_unusable;
 }
@@ -198,7 +213,7 @@ static inline u32 qed_chain_get_page_cnt(struct qed_chain *p_chain)
 
 static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 {
-	return p_chain->pbl.p_phys_table;
+	return p_chain->pbl_sp.p_phys_table;
 }
 
 /**
@@ -214,10 +229,10 @@ static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 static inline void
 qed_chain_advance_page(struct qed_chain *p_chain,
 		       void **p_next_elem, void *idx_to_inc, void *page_to_inc)
-
 {
 	struct qed_chain_next *p_next = NULL;
 	u32 page_index = 0;
+
 	switch (p_chain->mode) {
 	case QED_CHAIN_MODE_NEXT_PTR:
 		p_next = *p_next_elem;
@@ -305,7 +320,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain16.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl16.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -314,7 +329,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain32.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl32.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -378,7 +393,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain16.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl16.cons_page_idx;
+			p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
@@ -387,8 +402,8 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain32.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl32.cons_page_idx;
-		qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
+			p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx;
+			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
 		p_chain->u.chain32.cons_idx++;
@@ -429,25 +444,26 @@ static inline void qed_chain_reset(struct qed_chain *p_chain)
 		u32 reset_val = p_chain->page_cnt - 1;
 
 		if (is_chain_u16(p_chain)) {
-			p_chain->pbl.u.pbl16.prod_page_idx = (u16)reset_val;
-			p_chain->pbl.u.pbl16.cons_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val;
 		} else {
-			p_chain->pbl.u.pbl32.prod_page_idx = reset_val;
-			p_chain->pbl.u.pbl32.cons_page_idx = reset_val;
+			p_chain->pbl.c.u32.prod_page_idx = reset_val;
+			p_chain->pbl.c.u32.cons_page_idx = reset_val;
 		}
 	}
 
 	switch (p_chain->intended_use) {
-	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
-	case QED_CHAIN_USE_TO_PRODUCE:
-		/* Do nothing */
-		break;
-
 	case QED_CHAIN_USE_TO_CONSUME:
 		/* produce empty elements */
 		for (i = 0; i < p_chain->capacity; i++)
 			qed_chain_recycle_consumed(p_chain);
 		break;
+
+	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
+	case QED_CHAIN_USE_TO_PRODUCE:
+	default:
+		/* Do nothing */
+		break;
 	}
 }
 
@@ -473,13 +489,13 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->p_virt_addr = NULL;
 	p_chain->p_phys_addr = 0;
 	p_chain->elem_size	= elem_size;
-	p_chain->intended_use = intended_use;
+	p_chain->intended_use = (u8)intended_use;
 	p_chain->mode		= mode;
-	p_chain->cnt_type = cnt_type;
+	p_chain->cnt_type = (u8)cnt_type;
 
-	p_chain->elem_per_page		= ELEMS_PER_PAGE(elem_size);
+	p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size);
 	p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode);
-	p_chain->elem_per_page_mask	= p_chain->elem_per_page - 1;
+	p_chain->elem_per_page_mask = p_chain->elem_per_page - 1;
 	p_chain->elem_unusable = UNUSABLE_ELEMS_PER_PAGE(elem_size, mode);
 	p_chain->next_page_mask = (p_chain->usable_per_page &
 				   p_chain->elem_per_page_mask);
@@ -488,8 +504,8 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->capacity = p_chain->usable_per_page * page_cnt;
 	p_chain->size = p_chain->elem_per_page * page_cnt;
 
-	p_chain->pbl.p_phys_table = 0;
-	p_chain->pbl.p_virt_table = NULL;
+	p_chain->pbl_sp.p_phys_table = 0;
+	p_chain->pbl_sp.p_virt_table = NULL;
 	p_chain->pbl.pp_virt_addr_tbl = NULL;
 }
 
@@ -530,8 +546,8 @@ static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain,
 					  dma_addr_t p_phys_pbl,
 					  void **pp_virt_addr_tbl)
 {
-	p_chain->pbl.p_phys_table = p_phys_pbl;
-	p_chain->pbl.p_virt_table = p_virt_pbl;
+	p_chain->pbl_sp.p_phys_table = p_phys_pbl;
+	p_chain->pbl_sp.p_virt_table = p_virt_pbl;
 	p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl;
 }
 
-- 
cgit v1.2.3


From 3da7a37ae6886cfba9ef35428eb976fc2ef561fa Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Tue, 29 Nov 2016 16:47:06 +0200
Subject: qed*: Handle-based L2-queues.

The driver needs to maintain several FW/HW-indices for each one of
its queues. Currently, that mapping is done by the QED where it uses
an rx/tx array of so-called hw-cids, populating them whenever a new
queue is opened and clearing them upon destruction of said queues.

This maintenance is far from ideal - there's no real reason why
QED needs to maintain such a data-structure. It becomes even worse
when considering the fact that the PF's queues and its child VFs' queues
are all mapped into the same data-structure.
As a by-product, the set of parameters an interface needs to supply for
queue APIs is non-trivial, and some of the variables in the API
structures have different meaning depending on their exact place
in the configuration flow.

This patch re-organizes the way L2 queues are configured and maintained.
In short:
  - Required parameters for queue init are now well-defined.
  - Qed would allocate a queue-cid based on parameters.
    Upon initialization success, it would return a handle to caller.
  - Queue-handle would be maintained by entity requesting queue-init,
    not necessarily qed.
  - All further queue-APIs [update, destroy] would use the opaque
    handle as reference for the queue instead of various indices.

The possible owners of such handles:
  - PF queues [qede] - complete handles based on provided configuration.
  - VF queues [qede] - fw-context-less handles, containing only relative
    information; Only the PF-side would need the absolute indices
    for configuration, so they're omitted here.
  - VF queues [qed, PF-side] - complete handles based on VF initialization.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |  12 -
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |  26 --
 drivers/net/ethernet/qlogic/qed/qed_l2.c     | 595 +++++++++++++++------------
 drivers/net/ethernet/qlogic/qed/qed_l2.h     | 133 ++++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 275 +++++++++----
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  21 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  90 ++--
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  40 +-
 drivers/net/ethernet/qlogic/qede/qede.h      |   4 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 109 ++---
 include/linux/qed/qed_eth_if.h               |  56 +--
 11 files changed, 791 insertions(+), 570 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 50b8a01ff512..244dd40ccac3 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -241,15 +241,6 @@ struct qed_hw_info {
 	enum qed_wol_support b_wol_support;
 };
 
-struct qed_hw_cid_data {
-	u32	cid;
-	bool	b_cid_allocated;
-
-	/* Additional identifiers */
-	u16	opaque_fid;
-	u8	vport_id;
-};
-
 /* maximun size of read/write commands (HW limit) */
 #define DMAE_MAX_RW_SIZE        0x2000
 
@@ -416,9 +407,6 @@ struct qed_hwfn {
 
 	struct qed_dcbx_info		*p_dcbx_info;
 
-	struct qed_hw_cid_data		*p_tx_cids;
-	struct qed_hw_cid_data		*p_rx_cids;
-
 	struct qed_dmae_info		dmae_info;
 
 	/* QM init */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 80162ee0391f..00b9a67ba359 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -134,15 +134,6 @@ void qed_resc_free(struct qed_dev *cdev)
 
 	kfree(cdev->reset_stats);
 
-	for_each_hwfn(cdev, i) {
-		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
-
-		kfree(p_hwfn->p_tx_cids);
-		p_hwfn->p_tx_cids = NULL;
-		kfree(p_hwfn->p_rx_cids);
-		p_hwfn->p_rx_cids = NULL;
-	}
-
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
@@ -425,23 +416,6 @@ int qed_resc_alloc(struct qed_dev *cdev)
 	if (!cdev->fw_data)
 		return -ENOMEM;
 
-	/* Allocate Memory for the Queue->CID mapping */
-	for_each_hwfn(cdev, i) {
-		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
-		int tx_size = sizeof(struct qed_hw_cid_data) *
-				     RESC_NUM(p_hwfn, QED_L2_QUEUE);
-		int rx_size = sizeof(struct qed_hw_cid_data) *
-				     RESC_NUM(p_hwfn, QED_L2_QUEUE);
-
-		p_hwfn->p_tx_cids = kzalloc(tx_size, GFP_KERNEL);
-		if (!p_hwfn->p_tx_cids)
-			goto alloc_no_mem;
-
-		p_hwfn->p_rx_cids = kzalloc(rx_size, GFP_KERNEL);
-		if (!p_hwfn->p_rx_cids)
-			goto alloc_no_mem;
-	}
-
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 		u32 n_eqes, num_cons;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 900b253be317..6a3727c4c0c6 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/vmalloc.h>
 #include "qed.h"
 #include <linux/qed/qed_chain.h>
 #include "qed_cxt.h"
@@ -41,6 +42,124 @@
 #define QED_MAX_SGES_NUM 16
 #define CRC32_POLY 0x1edc6f41
 
+void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn,
+			       struct qed_queue_cid *p_cid)
+{
+	/* VFs' CIDs are 0-based in PF-view, and uninitialized on VF */
+	if (!p_cid->is_vf && IS_PF(p_hwfn->cdev))
+		qed_cxt_release_cid(p_hwfn, p_cid->cid);
+	vfree(p_cid);
+}
+
+/* The internal is only meant to be directly called by PFs initializing CIDs
+ * for their VFs.
+ */
+struct qed_queue_cid *
+_qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+		      u16 opaque_fid,
+		      u32 cid,
+		      u8 vf_qid,
+		      struct qed_queue_start_common_params *p_params)
+{
+	bool b_is_same = (p_hwfn->hw_info.opaque_fid == opaque_fid);
+	struct qed_queue_cid *p_cid;
+	int rc;
+
+	p_cid = vmalloc(sizeof(*p_cid));
+	if (!p_cid)
+		return NULL;
+	memset(p_cid, 0, sizeof(*p_cid));
+
+	p_cid->opaque_fid = opaque_fid;
+	p_cid->cid = cid;
+	p_cid->vf_qid = vf_qid;
+	p_cid->rel = *p_params;
+
+	/* Don't try calculating the absolute indices for VFs */
+	if (IS_VF(p_hwfn->cdev)) {
+		p_cid->abs = p_cid->rel;
+		goto out;
+	}
+
+	/* Calculate the engine-absolute indices of the resources.
+	 * This would guarantee they're valid later on.
+	 * In some cases [SBs] we already have the right values.
+	 */
+	rc = qed_fw_vport(p_hwfn, p_cid->rel.vport_id, &p_cid->abs.vport_id);
+	if (rc)
+		goto fail;
+
+	rc = qed_fw_l2_queue(p_hwfn, p_cid->rel.queue_id, &p_cid->abs.queue_id);
+	if (rc)
+		goto fail;
+
+	/* In case of a PF configuring its VF's queues, the stats-id is already
+	 * absolute [since there's a single index that's suitable per-VF].
+	 */
+	if (b_is_same) {
+		rc = qed_fw_vport(p_hwfn, p_cid->rel.stats_id,
+				  &p_cid->abs.stats_id);
+		if (rc)
+			goto fail;
+	} else {
+		p_cid->abs.stats_id = p_cid->rel.stats_id;
+	}
+
+	/* SBs relevant information was already provided as absolute */
+	p_cid->abs.sb = p_cid->rel.sb;
+	p_cid->abs.sb_idx = p_cid->rel.sb_idx;
+
+	/* This is tricky - we're actually interested in whether this is a PF
+	 * entry meant for the VF.
+	 */
+	if (!b_is_same)
+		p_cid->is_vf = true;
+out:
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_SP,
+		   "opaque_fid: %04x CID %08x vport %02x [%02x] qzone %04x [%04x] stats %02x [%02x] SB %04x PI %02x\n",
+		   p_cid->opaque_fid,
+		   p_cid->cid,
+		   p_cid->rel.vport_id,
+		   p_cid->abs.vport_id,
+		   p_cid->rel.queue_id,
+		   p_cid->abs.queue_id,
+		   p_cid->rel.stats_id,
+		   p_cid->abs.stats_id, p_cid->abs.sb, p_cid->abs.sb_idx);
+
+	return p_cid;
+
+fail:
+	vfree(p_cid);
+	return NULL;
+}
+
+static struct qed_queue_cid *qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+						  u16 opaque_fid, struct
+						  qed_queue_start_common_params
+						  *p_params)
+{
+	struct qed_queue_cid *p_cid;
+	u32 cid = 0;
+
+	/* Get a unique firmware CID for this queue, in case it's a PF.
+	 * VF's don't need a CID as the queue configuration will be done
+	 * by PF.
+	 */
+	if (IS_PF(p_hwfn->cdev)) {
+		if (qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &cid)) {
+			DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
+			return NULL;
+		}
+	}
+
+	p_cid = _qed_eth_queue_to_cid(p_hwfn, opaque_fid, cid, 0, p_params);
+	if (!p_cid && IS_PF(p_hwfn->cdev))
+		qed_cxt_release_cid(p_hwfn, cid);
+
+	return p_cid;
+}
+
 int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 			   struct qed_sp_vport_start_params *p_params)
 {
@@ -496,61 +615,26 @@ static int qed_filter_accept_cmd(struct qed_dev *cdev,
 	return 0;
 }
 
-static int qed_sp_release_queue_cid(
-	struct qed_hwfn *p_hwfn,
-	struct qed_hw_cid_data *p_cid_data)
-{
-	if (!p_cid_data->b_cid_allocated)
-		return 0;
-
-	qed_cxt_release_cid(p_hwfn, p_cid_data->cid);
-
-	p_cid_data->b_cid_allocated = false;
-
-	return 0;
-}
-
-int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
-				u16 opaque_fid,
-				u32 cid,
-				struct qed_queue_start_common_params *p_params,
-				u8 stats_id,
-				u16 bd_max_bytes,
-				dma_addr_t bd_chain_phys_addr,
-				dma_addr_t cqe_pbl_addr,
-				u16 cqe_pbl_size, bool b_use_zone_a_prod)
+int qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
+			     struct qed_queue_cid *p_cid,
+			     u16 bd_max_bytes,
+			     dma_addr_t bd_chain_phys_addr,
+			     dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size)
 {
 	struct rx_queue_start_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_rx_cid;
-	u16 abs_rx_q_id = 0;
-	u8 abs_vport_id = 0;
 	int rc = -EINVAL;
 
-	/* Store information for the stop */
-	p_rx_cid = &p_hwfn->p_rx_cids[p_params->queue_id];
-	p_rx_cid->cid = cid;
-	p_rx_cid->opaque_fid = opaque_fid;
-	p_rx_cid->vport_id = p_params->vport_id;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_rx_q_id);
-	if (rc)
-		return rc;
-
 	DP_VERBOSE(p_hwfn, QED_MSG_SP,
-		   "opaque_fid=0x%x, cid=0x%x, rx_qid=0x%x, vport_id=0x%x, sb_id=0x%x\n",
-		   opaque_fid,
-		   cid, p_params->queue_id, p_params->vport_id, p_params->sb);
+		   "opaque_fid=0x%x, cid=0x%x, rx_qzone=0x%x, vport_id=0x%x, sb_id=0x%x\n",
+		   p_cid->opaque_fid, p_cid->cid,
+		   p_cid->abs.queue_id, p_cid->abs.vport_id, p_cid->abs.sb);
 
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = cid;
-	init_data.opaque_fid = opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -561,11 +645,11 @@ int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
 
 	p_ramrod = &p_ent->ramrod.rx_queue_start;
 
-	p_ramrod->sb_id = cpu_to_le16(p_params->sb);
-	p_ramrod->sb_index = p_params->sb_idx;
-	p_ramrod->vport_id = abs_vport_id;
-	p_ramrod->stats_counter_id = stats_id;
-	p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+	p_ramrod->sb_id = cpu_to_le16(p_cid->abs.sb);
+	p_ramrod->sb_index = p_cid->abs.sb_idx;
+	p_ramrod->vport_id = p_cid->abs.vport_id;
+	p_ramrod->stats_counter_id = p_cid->abs.stats_id;
+	p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 	p_ramrod->complete_cqe_flg = 0;
 	p_ramrod->complete_event_flg = 1;
 
@@ -575,85 +659,85 @@ int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
 	p_ramrod->num_of_pbl_pages = cpu_to_le16(cqe_pbl_size);
 	DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, cqe_pbl_addr);
 
-	if (p_params->vf_qid || b_use_zone_a_prod) {
-		p_ramrod->vf_rx_prod_index = p_params->vf_qid;
+	if (p_cid->is_vf) {
+		p_ramrod->vf_rx_prod_index = p_cid->vf_qid;
 		DP_VERBOSE(p_hwfn, QED_MSG_SP,
 			   "Queue%s is meant for VF rxq[%02x]\n",
-			   b_use_zone_a_prod ? " [legacy]" : "",
-			   p_params->vf_qid);
-		p_ramrod->vf_rx_prod_use_zone_a = b_use_zone_a_prod;
+			   !!p_cid->b_legacy_vf ? " [legacy]" : "",
+			   p_cid->vf_qid);
+		p_ramrod->vf_rx_prod_use_zone_a = !!p_cid->b_legacy_vf;
 	}
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
 static int
-qed_sp_eth_rx_queue_start(struct qed_hwfn *p_hwfn,
-			  u16 opaque_fid,
-			  struct qed_queue_start_common_params *p_params,
+qed_eth_pf_rx_queue_start(struct qed_hwfn *p_hwfn,
+			  struct qed_queue_cid *p_cid,
 			  u16 bd_max_bytes,
 			  dma_addr_t bd_chain_phys_addr,
 			  dma_addr_t cqe_pbl_addr,
 			  u16 cqe_pbl_size, void __iomem **pp_prod)
 {
-	struct qed_hw_cid_data *p_rx_cid;
 	u32 init_prod_val = 0;
-	u16 abs_l2_queue = 0;
-	u8 abs_stats_id = 0;
-	int rc;
 
-	if (IS_VF(p_hwfn->cdev)) {
-		return qed_vf_pf_rxq_start(p_hwfn,
-					   p_params->queue_id,
-					   p_params->sb,
-					   (u8)p_params->sb_idx,
-					   bd_max_bytes,
-					   bd_chain_phys_addr,
-					   cqe_pbl_addr, cqe_pbl_size, pp_prod);
-	}
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_l2_queue);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_stats_id);
-	if (rc)
-		return rc;
-
-	*pp_prod = (u8 __iomem *)p_hwfn->regview +
-				 GTT_BAR0_MAP_REG_MSDM_RAM +
-				 MSTORM_ETH_PF_PRODS_OFFSET(abs_l2_queue);
+	*pp_prod = p_hwfn->regview +
+		   GTT_BAR0_MAP_REG_MSDM_RAM +
+		    MSTORM_ETH_PF_PRODS_OFFSET(p_cid->abs.queue_id);
 
 	/* Init the rcq, rx bd and rx sge (if valid) producers to 0 */
 	__internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32),
 			  (u32 *)(&init_prod_val));
 
+	return qed_eth_rxq_start_ramrod(p_hwfn, p_cid,
+					bd_max_bytes,
+					bd_chain_phys_addr,
+					cqe_pbl_addr, cqe_pbl_size);
+}
+
+static int
+qed_eth_rx_queue_start(struct qed_hwfn *p_hwfn,
+		       u16 opaque_fid,
+		       struct qed_queue_start_common_params *p_params,
+		       u16 bd_max_bytes,
+		       dma_addr_t bd_chain_phys_addr,
+		       dma_addr_t cqe_pbl_addr,
+		       u16 cqe_pbl_size,
+		       struct qed_rxq_start_ret_params *p_ret_params)
+{
+	struct qed_queue_cid *p_cid;
+	int rc;
+
 	/* Allocate a CID for the queue */
-	p_rx_cid = &p_hwfn->p_rx_cids[p_params->queue_id];
-	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &p_rx_cid->cid);
-	if (rc) {
-		DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
-		return rc;
-	}
-	p_rx_cid->b_cid_allocated = true;
+	p_cid = qed_eth_queue_to_cid(p_hwfn, opaque_fid, p_params);
+	if (!p_cid)
+		return -ENOMEM;
 
-	rc = qed_sp_eth_rxq_start_ramrod(p_hwfn,
-					 opaque_fid,
-					 p_rx_cid->cid,
-					 p_params,
-					 abs_stats_id,
+	if (IS_PF(p_hwfn->cdev)) {
+		rc = qed_eth_pf_rx_queue_start(p_hwfn, p_cid,
+					       bd_max_bytes,
+					       bd_chain_phys_addr,
+					       cqe_pbl_addr, cqe_pbl_size,
+					       &p_ret_params->p_prod);
+	} else {
+		rc = qed_vf_pf_rxq_start(p_hwfn, p_cid,
 					 bd_max_bytes,
 					 bd_chain_phys_addr,
-					 cqe_pbl_addr, cqe_pbl_size, false);
+					 cqe_pbl_addr,
+					 cqe_pbl_size, &p_ret_params->p_prod);
+	}
 
+	/* Provide the caller with a reference to use as a handle */
 	if (rc)
-		qed_sp_release_queue_cid(p_hwfn, p_rx_cid);
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	else
+		p_ret_params->p_handle = (void *)p_cid;
 
 	return rc;
 }
 
 int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
-				u16 rx_queue_id,
+				void **pp_rxq_handles,
 				u8 num_rxqs,
 				u8 complete_cqe_flg,
 				u8 complete_event_flg,
@@ -663,8 +747,7 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	struct rx_queue_update_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_rx_cid;
-	u16 qid, abs_rx_q_id = 0;
+	struct qed_queue_cid *p_cid;
 	int rc = -EINVAL;
 	u8 i;
 
@@ -673,12 +756,11 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	init_data.p_comp_data = p_comp_data;
 
 	for (i = 0; i < num_rxqs; i++) {
-		qid = rx_queue_id + i;
-		p_rx_cid = &p_hwfn->p_rx_cids[qid];
+		p_cid = ((struct qed_queue_cid **)pp_rxq_handles)[i];
 
 		/* Get SPQ entry */
-		init_data.cid = p_rx_cid->cid;
-		init_data.opaque_fid = p_rx_cid->opaque_fid;
+		init_data.cid = p_cid->cid;
+		init_data.opaque_fid = p_cid->opaque_fid;
 
 		rc = qed_sp_init_request(p_hwfn, &p_ent,
 					 ETH_RAMROD_RX_QUEUE_UPDATE,
@@ -687,10 +769,9 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 			return rc;
 
 		p_ramrod = &p_ent->ramrod.rx_queue_update;
+		p_ramrod->vport_id = p_cid->abs.vport_id;
 
-		qed_fw_vport(p_hwfn, p_rx_cid->vport_id, &p_ramrod->vport_id);
-		qed_fw_l2_queue(p_hwfn, qid, &abs_rx_q_id);
-		p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+		p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 		p_ramrod->complete_cqe_flg = complete_cqe_flg;
 		p_ramrod->complete_event_flg = complete_event_flg;
 
@@ -702,24 +783,19 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
-			     u16 rx_queue_id,
-			     bool eq_completion_only, bool cqe_completion)
+static int
+qed_eth_pf_rx_queue_stop(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 bool b_eq_completion_only, bool b_cqe_completion)
 {
-	struct qed_hw_cid_data *p_rx_cid = &p_hwfn->p_rx_cids[rx_queue_id];
 	struct rx_queue_stop_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	u16 abs_rx_q_id = 0;
-	int rc = -EINVAL;
-
-	if (IS_VF(p_hwfn->cdev))
-		return qed_vf_pf_rxq_stop(p_hwfn, rx_queue_id, cqe_completion);
+	int rc;
 
-	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = p_rx_cid->cid;
-	init_data.opaque_fid = p_rx_cid->opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -729,62 +805,53 @@ int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
 		return rc;
 
 	p_ramrod = &p_ent->ramrod.rx_queue_stop;
-
-	qed_fw_vport(p_hwfn, p_rx_cid->vport_id, &p_ramrod->vport_id);
-	qed_fw_l2_queue(p_hwfn, rx_queue_id, &abs_rx_q_id);
-	p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+	p_ramrod->vport_id = p_cid->abs.vport_id;
+	p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 
 	/* Cleaning the queue requires the completion to arrive there.
 	 * In addition, VFs require the answer to come as eqe to PF.
 	 */
-	p_ramrod->complete_cqe_flg =
-		(!!(p_rx_cid->opaque_fid == p_hwfn->hw_info.opaque_fid) &&
-		 !eq_completion_only) || cqe_completion;
-	p_ramrod->complete_event_flg =
-		!(p_rx_cid->opaque_fid == p_hwfn->hw_info.opaque_fid) ||
-		eq_completion_only;
+	p_ramrod->complete_cqe_flg = (!p_cid->is_vf &&
+				      !b_eq_completion_only) ||
+				     b_cqe_completion;
+	p_ramrod->complete_event_flg = p_cid->is_vf || b_eq_completion_only;
 
-	rc = qed_spq_post(p_hwfn, p_ent, NULL);
-	if (rc)
-		return rc;
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+int qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
+			  void *p_rxq,
+			  bool eq_completion_only, bool cqe_completion)
+{
+	struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_rxq;
+	int rc = -EINVAL;
 
-	return qed_sp_release_queue_cid(p_hwfn, p_rx_cid);
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_rx_queue_stop(p_hwfn, p_cid,
+					      eq_completion_only,
+					      cqe_completion);
+	else
+		rc = qed_vf_pf_rxq_stop(p_hwfn, p_cid, cqe_completion);
+
+	if (!rc)
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	return rc;
 }
 
-int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
-				u16  opaque_fid,
-				u32  cid,
-				struct qed_queue_start_common_params *p_params,
-				u8  stats_id,
-				dma_addr_t pbl_addr,
-				u16 pbl_size,
-				union qed_qm_pq_params *p_pq_params)
+int
+qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id)
 {
 	struct tx_queue_start_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_tx_cid;
-	u16 pq_id, abs_tx_q_id = 0;
 	int rc = -EINVAL;
-	u8 abs_vport_id;
-
-	/* Store information for the stop */
-	p_tx_cid = &p_hwfn->p_tx_cids[p_params->queue_id];
-	p_tx_cid->cid		= cid;
-	p_tx_cid->opaque_fid	= opaque_fid;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_tx_q_id);
-	if (rc)
-		return rc;
 
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = cid;
-	init_data.opaque_fid = opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -794,96 +861,92 @@ int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
 		return rc;
 
 	p_ramrod = &p_ent->ramrod.tx_queue_start;
-	p_ramrod->vport_id = abs_vport_id;
+	p_ramrod->vport_id = p_cid->abs.vport_id;
 
-	p_ramrod->sb_id = cpu_to_le16(p_params->sb);
-	p_ramrod->sb_index = p_params->sb_idx;
-	p_ramrod->stats_counter_id = stats_id;
+	p_ramrod->sb_id = cpu_to_le16(p_cid->abs.sb);
+	p_ramrod->sb_index = p_cid->abs.sb_idx;
+	p_ramrod->stats_counter_id = p_cid->abs.stats_id;
 
-	p_ramrod->queue_zone_id = cpu_to_le16(abs_tx_q_id);
+	p_ramrod->queue_zone_id = cpu_to_le16(p_cid->abs.queue_id);
+	p_ramrod->same_as_last_id = cpu_to_le16(p_cid->abs.queue_id);
 
 	p_ramrod->pbl_size = cpu_to_le16(pbl_size);
 	DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, pbl_addr);
 
-	pq_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH, p_pq_params);
 	p_ramrod->qm_pq_id = cpu_to_le16(pq_id);
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
 static int
-qed_sp_eth_tx_queue_start(struct qed_hwfn *p_hwfn,
-			  u16 opaque_fid,
-			  struct qed_queue_start_common_params *p_params,
+qed_eth_pf_tx_queue_start(struct qed_hwfn *p_hwfn,
+			  struct qed_queue_cid *p_cid,
+			  u8 tc,
 			  dma_addr_t pbl_addr,
 			  u16 pbl_size, void __iomem **pp_doorbell)
 {
-	struct qed_hw_cid_data *p_tx_cid;
 	union qed_qm_pq_params pq_params;
-	u8 abs_stats_id = 0;
 	int rc;
 
-	if (IS_VF(p_hwfn->cdev)) {
-		return qed_vf_pf_txq_start(p_hwfn,
-					   p_params->queue_id,
-					   p_params->sb,
-					   p_params->sb_idx,
-					   pbl_addr, pbl_size, pp_doorbell);
-	}
+	memset(&pq_params, 0, sizeof(pq_params));
 
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_stats_id);
+	rc = qed_eth_txq_start_ramrod(p_hwfn, p_cid,
+				      pbl_addr, pbl_size,
+				      qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH,
+						    &pq_params));
 	if (rc)
 		return rc;
 
-	p_tx_cid = &p_hwfn->p_tx_cids[p_params->queue_id];
-	memset(p_tx_cid, 0, sizeof(*p_tx_cid));
-	memset(&pq_params, 0, sizeof(pq_params));
+	/* Provide the caller with the necessary return values */
+	*pp_doorbell = p_hwfn->doorbells +
+		       qed_db_addr(p_cid->cid, DQ_DEMS_LEGACY);
 
-	/* Allocate a CID for the queue */
-	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &p_tx_cid->cid);
-	if (rc) {
-		DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
-		return rc;
-	}
-	p_tx_cid->b_cid_allocated = true;
+	return 0;
+}
 
-	DP_VERBOSE(p_hwfn, QED_MSG_SP,
-		   "opaque_fid=0x%x, cid=0x%x, tx_qid=0x%x, vport_id=0x%x, sb_id=0x%x\n",
-		   opaque_fid, p_tx_cid->cid,
-		   p_params->queue_id, p_params->vport_id, p_params->sb);
-
-	rc = qed_sp_eth_txq_start_ramrod(p_hwfn,
-					 opaque_fid,
-					 p_tx_cid->cid,
-					 p_params,
-					 abs_stats_id,
-					 pbl_addr,
-					 pbl_size,
-					 &pq_params);
-
-	*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-				     qed_db_addr(p_tx_cid->cid, DQ_DEMS_LEGACY);
+static int
+qed_eth_tx_queue_start(struct qed_hwfn *p_hwfn,
+		       u16 opaque_fid,
+		       struct qed_queue_start_common_params *p_params,
+		       u8 tc,
+		       dma_addr_t pbl_addr,
+		       u16 pbl_size,
+		       struct qed_txq_start_ret_params *p_ret_params)
+{
+	struct qed_queue_cid *p_cid;
+	int rc;
+
+	p_cid = qed_eth_queue_to_cid(p_hwfn, opaque_fid, p_params);
+	if (!p_cid)
+		return -EINVAL;
+
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_tx_queue_start(p_hwfn, p_cid, tc,
+					       pbl_addr, pbl_size,
+					       &p_ret_params->p_doorbell);
+	else
+		rc = qed_vf_pf_txq_start(p_hwfn, p_cid,
+					 pbl_addr, pbl_size,
+					 &p_ret_params->p_doorbell);
 
 	if (rc)
-		qed_sp_release_queue_cid(p_hwfn, p_tx_cid);
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	else
+		p_ret_params->p_handle = (void *)p_cid;
 
 	return rc;
 }
 
-int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id)
+static int
+qed_eth_pf_tx_queue_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid)
 {
-	struct qed_hw_cid_data *p_tx_cid = &p_hwfn->p_tx_cids[tx_queue_id];
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	int rc = -EINVAL;
-
-	if (IS_VF(p_hwfn->cdev))
-		return qed_vf_pf_txq_stop(p_hwfn, tx_queue_id);
+	int rc;
 
-	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = p_tx_cid->cid;
-	init_data.opaque_fid = p_tx_cid->opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -892,11 +955,22 @@ int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id)
 	if (rc)
 		return rc;
 
-	rc = qed_spq_post(p_hwfn, p_ent, NULL);
-	if (rc)
-		return rc;
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_handle)
+{
+	struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_handle;
+	int rc;
+
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_tx_queue_stop(p_hwfn, p_cid);
+	else
+		rc = qed_vf_pf_txq_stop(p_hwfn, p_cid);
 
-	return qed_sp_release_queue_cid(p_hwfn, p_tx_cid);
+	if (!rc)
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	return rc;
 }
 
 static enum eth_filter_action qed_filter_action(enum qed_filter_opcode opcode)
@@ -1880,58 +1954,53 @@ static int qed_update_vport(struct qed_dev *cdev,
 }
 
 static int qed_start_rxq(struct qed_dev *cdev,
-			 struct qed_queue_start_common_params *params,
+			 u8 rss_num,
+			 struct qed_queue_start_common_params *p_params,
 			 u16 bd_max_bytes,
 			 dma_addr_t bd_chain_phys_addr,
 			 dma_addr_t cqe_pbl_addr,
 			 u16 cqe_pbl_size,
-			 void __iomem **pp_prod)
+			 struct qed_rxq_start_ret_params *ret_params)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index = params->rss_id % cdev->num_hwfns;
+	hwfn_index = rss_num % cdev->num_hwfns;
 	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	/* Fix queue ID in 100g mode */
-	params->queue_id /= cdev->num_hwfns;
-
-	rc = qed_sp_eth_rx_queue_start(p_hwfn,
-				       p_hwfn->hw_info.opaque_fid,
-				       params,
-				       bd_max_bytes,
-				       bd_chain_phys_addr,
-				       cqe_pbl_addr,
-				       cqe_pbl_size,
-				       pp_prod);
+	p_params->queue_id = p_params->queue_id / cdev->num_hwfns;
+	p_params->stats_id = p_params->vport_id;
 
+	rc = qed_eth_rx_queue_start(p_hwfn,
+				    p_hwfn->hw_info.opaque_fid,
+				    p_params,
+				    bd_max_bytes,
+				    bd_chain_phys_addr,
+				    cqe_pbl_addr, cqe_pbl_size, ret_params);
 	if (rc) {
-		DP_ERR(cdev, "Failed to start RXQ#%d\n", params->queue_id);
+		DP_ERR(cdev, "Failed to start RXQ#%d\n", p_params->queue_id);
 		return rc;
 	}
 
 	DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-		   "Started RX-Q %d [rss %d] on V-PORT %d and SB %d\n",
-		   params->queue_id, params->rss_id, params->vport_id,
-		   params->sb);
+		   "Started RX-Q %d [rss_num %d] on V-PORT %d and SB %d\n",
+		   p_params->queue_id, rss_num, p_params->vport_id,
+		   p_params->sb);
 
 	return 0;
 }
 
-static int qed_stop_rxq(struct qed_dev *cdev,
-			struct qed_stop_rxq_params *params)
+static int qed_stop_rxq(struct qed_dev *cdev, u8 rss_id, void *handle)
 {
 	int rc, hwfn_index;
 	struct qed_hwfn *p_hwfn;
 
-	hwfn_index	= params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
+	hwfn_index = rss_id % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	rc = qed_sp_eth_rx_queue_stop(p_hwfn,
-				      params->rx_queue_id / cdev->num_hwfns,
-				      params->eq_completion_only, false);
+	rc = qed_eth_rx_queue_stop(p_hwfn, handle, false, false);
 	if (rc) {
-		DP_ERR(cdev, "Failed to stop RXQ#%d\n", params->rx_queue_id);
+		DP_ERR(cdev, "Failed to stop RXQ#%02x\n", rss_id);
 		return rc;
 	}
 
@@ -1939,26 +2008,24 @@ static int qed_stop_rxq(struct qed_dev *cdev,
 }
 
 static int qed_start_txq(struct qed_dev *cdev,
+			 u8 rss_num,
 			 struct qed_queue_start_common_params *p_params,
 			 dma_addr_t pbl_addr,
 			 u16 pbl_size,
-			 void __iomem **pp_doorbell)
+			 struct qed_txq_start_ret_params *ret_params)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index	= p_params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
-
-	/* Fix queue ID in 100g mode */
-	p_params->queue_id /= cdev->num_hwfns;
+	hwfn_index = rss_num % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
+	p_params->queue_id = p_params->queue_id / cdev->num_hwfns;
+	p_params->stats_id = p_params->vport_id;
 
-	rc = qed_sp_eth_tx_queue_start(p_hwfn,
-				       p_hwfn->hw_info.opaque_fid,
-				       p_params,
-				       pbl_addr,
-				       pbl_size,
-				       pp_doorbell);
+	rc = qed_eth_tx_queue_start(p_hwfn,
+				    p_hwfn->hw_info.opaque_fid,
+				    p_params, 0,
+				    pbl_addr, pbl_size, ret_params);
 
 	if (rc) {
 		DP_ERR(cdev, "Failed to start TXQ#%d\n", p_params->queue_id);
@@ -1966,8 +2033,8 @@ static int qed_start_txq(struct qed_dev *cdev,
 	}
 
 	DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-		   "Started TX-Q %d [rss %d] on V-PORT %d and SB %d\n",
-		   p_params->queue_id, p_params->rss_id, p_params->vport_id,
+		   "Started TX-Q %d [rss_num %d] on V-PORT %d and SB %d\n",
+		   p_params->queue_id, rss_num, p_params->vport_id,
 		   p_params->sb);
 
 	return 0;
@@ -1981,19 +2048,17 @@ static int qed_fastpath_stop(struct qed_dev *cdev)
 	return 0;
 }
 
-static int qed_stop_txq(struct qed_dev *cdev,
-			struct qed_stop_txq_params *params)
+static int qed_stop_txq(struct qed_dev *cdev, u8 rss_id, void *handle)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index	= params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
+	hwfn_index = rss_id % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	rc = qed_sp_eth_tx_queue_stop(p_hwfn,
-				      params->tx_queue_id / cdev->num_hwfns);
+	rc = qed_eth_tx_queue_stop(p_hwfn, handle);
 	if (rc) {
-		DP_ERR(cdev, "Failed to stop TXQ#%d\n", params->tx_queue_id);
+		DP_ERR(cdev, "Failed to stop TXQ#%02x\n", rss_id);
 		return rc;
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index e495d62fcc03..48c9bfc28140 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -78,11 +78,34 @@ struct qed_filter_mcast {
 	unsigned char mac[QED_MAX_MC_ADDRS][ETH_ALEN];
 };
 
-int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
-			     u16 rx_queue_id,
-			     bool eq_completion_only, bool cqe_completion);
+/**
+ * @brief qed_eth_rx_queue_stop - This ramrod closes an Rx queue
+ *
+ * @param p_hwfn
+ * @param p_rxq			Handle of the queue to close
+ * @param eq_completion_only	If true, completion will be on the
+ *				EQe; if false, completion will be
+ *				on the EQe if the p_hwfn opaque
+ *				differs from the RXQ opaque,
+ *				otherwise on the CQe.
+ * @param cqe_completion	If true, completion will be
+ *				received on the CQe.
+ * @return int
+ */
+int
+qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
+		      void *p_rxq,
+		      bool eq_completion_only, bool cqe_completion);
 
-int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id);
+/**
+ * @brief qed_eth_tx_queue_stop - closes a Tx queue
+ *
+ * @param p_hwfn
+ * @param p_txq - handle of the Tx queue to be closed
+ *
+ * @return int
+ */
+int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_txq);
 
 enum qed_tpa_mode {
 	QED_TPA_MODE_NONE,
@@ -196,19 +219,19 @@ int qed_sp_eth_filter_ucast(struct qed_hwfn *p_hwfn,
  * @note At the moment - only used by non-linux VFs.
  *
  * @param p_hwfn
- * @param rx_queue_id		RX Queue ID
- * @param num_rxqs		Allow to update multiple rx
- *				queues, from rx_queue_id to
- *				(rx_queue_id + num_rxqs)
+ * @param pp_rxq_handlers	An array of queue handlers to be updated.
+ * @param num_rxqs              number of queues to update.
  * @param complete_cqe_flg	Post completion to the CQE Ring if set
  * @param complete_event_flg	Post completion to the Event Ring if set
+ * @param comp_mode
+ * @param p_comp_data
  *
  * @return int
  */
 
 int
 qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
-			    u16 rx_queue_id,
+			    void **pp_rxq_handlers,
 			    u8 num_rxqs,
 			    u8 complete_cqe_flg,
 			    u8 complete_event_flg,
@@ -217,27 +240,79 @@ qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 
 void qed_get_vport_stats(struct qed_dev *cdev, struct qed_eth_stats *stats);
 
-int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
-			   struct qed_sp_vport_start_params *p_params);
+void qed_reset_vport_stats(struct qed_dev *cdev);
+
+struct qed_queue_cid {
+	/* 'Relative' is a relative term ;-). Usually the indices [not counting
+	 * SBs] would be PF-relative, but there are some cases where that isn't
+	 * the case - specifically for a PF configuring its VF indices it's
+	 * possible some fields [E.g., stats-id] in 'rel' would already be abs.
+	 */
+	struct qed_queue_start_common_params rel;
+	struct qed_queue_start_common_params abs;
+	u32 cid;
+	u16 opaque_fid;
+
+	/* VFs queues are mapped differently, so we need to know the
+	 * relative queue associated with them [0-based].
+	 * Notice this is relevant on the *PF* queue-cid of its VF's queues,
+	 * and not on the VF itself.
+	 */
+	bool is_vf;
+	u8 vf_qid;
+
+	/* Legacy VFs might have Rx producer located elsewhere */
+	bool b_legacy_vf;
+};
 
-int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
-				u16 opaque_fid,
-				u32 cid,
-				struct qed_queue_start_common_params *params,
-				u8 stats_id,
-				u16 bd_max_bytes,
-				dma_addr_t bd_chain_phys_addr,
-				dma_addr_t cqe_pbl_addr,
-				u16 cqe_pbl_size, bool b_use_zone_a_prod);
-
-int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
-				u16  opaque_fid,
-				u32  cid,
-				struct qed_queue_start_common_params *p_params,
-				u8  stats_id,
-				dma_addr_t pbl_addr,
-				u16 pbl_size,
-				union qed_qm_pq_params *p_pq_params);
+void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn,
+			       struct qed_queue_cid *p_cid);
+
+struct qed_queue_cid *_qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+					    u16 opaque_fid,
+					    u32 cid,
+					    u8 vf_qid,
+					    struct qed_queue_start_common_params
+					    *p_params);
+
+int
+qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
+		       struct qed_sp_vport_start_params *p_params);
+
+/**
+ * @brief - Starts an Rx queue, when queue_cid is already prepared
+ *
+ * @param p_hwfn
+ * @param p_cid
+ * @param bd_max_bytes
+ * @param bd_chain_phys_addr
+ * @param cqe_pbl_addr
+ * @param cqe_pbl_size
+ *
+ * @return int
+ */
+int
+qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 u16 bd_max_bytes,
+			 dma_addr_t bd_chain_phys_addr,
+			 dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size);
+
+/**
+ * @brief - Starts a Tx queue, where queue_cid is already prepared
+ *
+ * @param p_hwfn
+ * @param p_cid
+ * @param pbl_addr
+ * @param pbl_size
+ * @param p_pq_params - parameters for choosing the PQ for this Tx queue
+ *
+ * @return int
+ */
+int
+qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id);
 
 u8 qed_mcast_bin_from_mac(u8 *mac);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index f3f742a4e59a..85b09dd1787a 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -808,37 +808,70 @@ static void qed_iov_free_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 
 static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 				  struct qed_ptt *p_ptt,
-				  u16 rel_vf_id, u16 num_rx_queues)
+				  struct qed_iov_vf_init_params *p_params)
 {
 	u8 num_of_vf_avaiable_chains = 0;
 	struct qed_vf_info *vf = NULL;
+	u16 qid, num_irqs;
 	int rc = 0;
 	u32 cids;
 	u8 i;
 
-	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	vf = qed_iov_get_vf_info(p_hwfn, p_params->rel_vf_id, false);
 	if (!vf) {
 		DP_ERR(p_hwfn, "qed_iov_init_hw_for_vf : vf is NULL\n");
 		return -EINVAL;
 	}
 
 	if (vf->b_init) {
-		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n", rel_vf_id);
+		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n",
+			  p_params->rel_vf_id);
 		return -EINVAL;
 	}
 
+	/* Perform sanity checking on the requested queue_id */
+	for (i = 0; i < p_params->num_queues; i++) {
+		u16 min_vf_qzone = FEAT_NUM(p_hwfn, QED_PF_L2_QUE);
+		u16 max_vf_qzone = min_vf_qzone +
+		    FEAT_NUM(p_hwfn, QED_VF_L2_QUE) - 1;
+
+		qid = p_params->req_rx_queue[i];
+		if (qid < min_vf_qzone || qid > max_vf_qzone) {
+			DP_NOTICE(p_hwfn,
+				  "Can't enable Rx qid [%04x] for VF[%d]: qids [0x%04x,...,0x%04x] available\n",
+				  qid,
+				  p_params->rel_vf_id,
+				  min_vf_qzone, max_vf_qzone);
+			return -EINVAL;
+		}
+
+		qid = p_params->req_tx_queue[i];
+		if (qid > max_vf_qzone) {
+			DP_NOTICE(p_hwfn,
+				  "Can't enable Tx qid [%04x] for VF[%d]: max qid 0x%04x\n",
+				  qid, p_params->rel_vf_id, max_vf_qzone);
+			return -EINVAL;
+		}
+
+		/* If client *really* wants, Tx qid can be shared with PF */
+		if (qid < min_vf_qzone)
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF[%d] is using PF qid [0x%04x] for Txq[0x%02x]\n",
+				   p_params->rel_vf_id, qid, i);
+	}
+
 	/* Limit number of queues according to number of CIDs */
 	qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, &cids);
 	DP_VERBOSE(p_hwfn,
 		   QED_MSG_IOV,
 		   "VF[%d] - requesting to initialize for 0x%04x queues [0x%04x CIDs available]\n",
-		   vf->relative_vf_id, num_rx_queues, (u16) cids);
-	num_rx_queues = min_t(u16, num_rx_queues, ((u16) cids));
+		   vf->relative_vf_id, p_params->num_queues, (u16)cids);
+	num_irqs = min_t(u16, p_params->num_queues, ((u16)cids));
 
 	num_of_vf_avaiable_chains = qed_iov_alloc_vf_igu_sbs(p_hwfn,
 							     p_ptt,
-							     vf,
-							     num_rx_queues);
+							     vf, num_irqs);
 	if (!num_of_vf_avaiable_chains) {
 		DP_ERR(p_hwfn, "no available igu sbs\n");
 		return -ENOMEM;
@@ -849,25 +882,22 @@ static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 	vf->num_txqs = num_of_vf_avaiable_chains;
 
 	for (i = 0; i < vf->num_rxqs; i++) {
-		u16 queue_id = qed_int_queue_id_from_sb_id(p_hwfn,
-							   vf->igu_sbs[i]);
+		struct qed_vf_q_info *p_queue = &vf->vf_queues[i];
 
-		if (queue_id > RESC_NUM(p_hwfn, QED_L2_QUEUE)) {
-			DP_NOTICE(p_hwfn,
-				  "VF[%d] will require utilizing of out-of-bounds queues - %04x\n",
-				  vf->relative_vf_id, queue_id);
-			return -EINVAL;
-		}
+		p_queue->fw_rx_qid = p_params->req_rx_queue[i];
+		p_queue->fw_tx_qid = p_params->req_tx_queue[i];
 
 		/* CIDs are per-VF, so no problem having them 0-based. */
-		vf->vf_queues[i].fw_rx_qid = queue_id;
-		vf->vf_queues[i].fw_tx_qid = queue_id;
-		vf->vf_queues[i].fw_cid = i;
+		p_queue->fw_cid = i;
 
 		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-			   "VF[%d] - [%d] SB %04x, Tx/Rx queue %04x CID %04x\n",
-			   vf->relative_vf_id, i, vf->igu_sbs[i], queue_id, i);
+			   "VF[%d] - Q[%d] SB %04x, qid [Rx %04x Tx %04x]  CID %04x\n",
+			   vf->relative_vf_id,
+			   i, vf->igu_sbs[i],
+			   p_queue->fw_rx_qid,
+			   p_queue->fw_tx_qid, p_queue->fw_cid);
 	}
+
 	rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, vf);
 	if (!rc) {
 		vf->b_init = true;
@@ -1187,8 +1217,19 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 
 	p_vf->num_active_rxqs = 0;
 
-	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
-		p_vf->vf_queues[i].rxq_active = 0;
+	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
+		struct qed_vf_q_info *p_queue = &p_vf->vf_queues[i];
+
+		if (p_queue->p_rx_cid) {
+			qed_eth_queue_cid_release(p_hwfn, p_queue->p_rx_cid);
+			p_queue->p_rx_cid = NULL;
+		}
+
+		if (p_queue->p_tx_cid) {
+			qed_eth_queue_cid_release(p_hwfn, p_queue->p_tx_cid);
+			p_queue->p_tx_cid = NULL;
+		}
+	}
 
 	memset(&p_vf->shadow_config, 0, sizeof(p_vf->shadow_config));
 	memset(&p_vf->acquire, 0, sizeof(p_vf->acquire));
@@ -1594,21 +1635,21 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 
 		/* Update all the Rx queues */
 		for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
-			u16 qid;
+			struct qed_queue_cid *p_cid;
 
-			if (!p_vf->vf_queues[i].rxq_active)
+			p_cid = p_vf->vf_queues[i].p_rx_cid;
+			if (!p_cid)
 				continue;
 
-			qid = p_vf->vf_queues[i].fw_rx_qid;
-
-			rc = qed_sp_eth_rx_queues_update(p_hwfn, qid,
+			rc = qed_sp_eth_rx_queues_update(p_hwfn,
+							 (void **)&p_cid,
 							 1, 0, 1,
 							 QED_SPQ_MODE_EBLOCK,
 							 NULL);
 			if (rc) {
 				DP_NOTICE(p_hwfn,
 					  "Failed to send Rx update fo queue[0x%04x]\n",
-					  qid);
+					  p_cid->rel.queue_id);
 				return rc;
 			}
 		}
@@ -1782,23 +1823,34 @@ static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn,
 	struct qed_queue_start_common_params params;
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	u8 status = PFVF_STATUS_NO_RESOURCE;
+	struct qed_vf_q_info *p_queue;
 	struct vfpf_start_rxq_tlv *req;
 	bool b_legacy_vf = false;
 	int rc;
 
-	memset(&params, 0, sizeof(params));
 	req = &mbx->req_virt->start_rxq;
 
 	if (!qed_iov_validate_rxq(p_hwfn, vf, req->rx_qid) ||
 	    !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb))
 		goto out;
 
-	params.queue_id =  vf->vf_queues[req->rx_qid].fw_rx_qid;
-	params.vf_qid = req->rx_qid;
+	/* Acquire a new queue-cid */
+	p_queue = &vf->vf_queues[req->rx_qid];
+
+	memset(&params, 0, sizeof(params));
+	params.queue_id = p_queue->fw_rx_qid;
 	params.vport_id = vf->vport_id;
+	params.stats_id = vf->abs_vf_id + 0x10;
 	params.sb = req->hw_sb;
 	params.sb_idx = req->sb_index;
 
+	p_queue->p_rx_cid = _qed_eth_queue_to_cid(p_hwfn,
+						  vf->opaque_fid,
+						  p_queue->fw_cid,
+						  req->rx_qid, &params);
+	if (!p_queue->p_rx_cid)
+		goto out;
+
 	/* Legacy VFs have their Producers in a different location, which they
 	 * calculate on their own and clean the producer prior to this.
 	 */
@@ -1811,21 +1863,19 @@ static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn,
 		       MSTORM_ETH_VF_PRODS_OFFSET(vf->abs_vf_id, req->rx_qid),
 		       0);
 	}
+	p_queue->p_rx_cid->b_legacy_vf = b_legacy_vf;
 
-	rc = qed_sp_eth_rxq_start_ramrod(p_hwfn, vf->opaque_fid,
-					 vf->vf_queues[req->rx_qid].fw_cid,
-					 &params,
-					 vf->abs_vf_id + 0x10,
-					 req->bd_max_bytes,
-					 req->rxq_addr,
-					 req->cqe_pbl_addr, req->cqe_pbl_size,
-					 b_legacy_vf);
-
+	rc = qed_eth_rxq_start_ramrod(p_hwfn,
+				      p_queue->p_rx_cid,
+				      req->bd_max_bytes,
+				      req->rxq_addr,
+				      req->cqe_pbl_addr, req->cqe_pbl_size);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
+		qed_eth_queue_cid_release(p_hwfn, p_queue->p_rx_cid);
+		p_queue->p_rx_cid = NULL;
 	} else {
 		status = PFVF_STATUS_SUCCESS;
-		vf->vf_queues[req->rx_qid].rxq_active = true;
 		vf->num_active_rxqs++;
 	}
 
@@ -1882,7 +1932,9 @@ static void qed_iov_vf_mbx_start_txq(struct qed_hwfn *p_hwfn,
 	u8 status = PFVF_STATUS_NO_RESOURCE;
 	union qed_qm_pq_params pq_params;
 	struct vfpf_start_txq_tlv *req;
+	struct qed_vf_q_info *p_queue;
 	int rc;
+	u16 pq;
 
 	/* Prepare the parameters which would choose the right PQ */
 	memset(&pq_params, 0, sizeof(pq_params));
@@ -1896,24 +1948,31 @@ static void qed_iov_vf_mbx_start_txq(struct qed_hwfn *p_hwfn,
 	    !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb))
 		goto out;
 
-	params.queue_id =  vf->vf_queues[req->tx_qid].fw_tx_qid;
+	/* Acquire a new queue-cid */
+	p_queue = &vf->vf_queues[req->tx_qid];
+
+	params.queue_id = p_queue->fw_tx_qid;
 	params.vport_id = vf->vport_id;
+	params.stats_id = vf->abs_vf_id + 0x10;
 	params.sb = req->hw_sb;
 	params.sb_idx = req->sb_index;
 
-	rc = qed_sp_eth_txq_start_ramrod(p_hwfn,
-					 vf->opaque_fid,
-					 vf->vf_queues[req->tx_qid].fw_cid,
-					 &params,
-					 vf->abs_vf_id + 0x10,
-					 req->pbl_addr,
-					 req->pbl_size, &pq_params);
+	p_queue->p_tx_cid = _qed_eth_queue_to_cid(p_hwfn,
+						  vf->opaque_fid,
+						  p_queue->fw_cid,
+						  req->tx_qid, &params);
+	if (!p_queue->p_tx_cid)
+		goto out;
 
+	pq = qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH, &pq_params);
+	rc = qed_eth_txq_start_ramrod(p_hwfn, p_queue->p_tx_cid,
+				      req->pbl_addr, req->pbl_size, pq);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
+		qed_eth_queue_cid_release(p_hwfn, p_queue->p_tx_cid);
+		p_queue->p_tx_cid = NULL;
 	} else {
 		status = PFVF_STATUS_SUCCESS;
-		vf->vf_queues[req->tx_qid].txq_active = true;
 	}
 
 out:
@@ -1924,6 +1983,7 @@ static int qed_iov_vf_stop_rxqs(struct qed_hwfn *p_hwfn,
 				struct qed_vf_info *vf,
 				u16 rxq_id, u8 num_rxqs, bool cqe_completion)
 {
+	struct qed_vf_q_info *p_queue;
 	int rc = 0;
 	int qid;
 
@@ -1931,16 +1991,18 @@ static int qed_iov_vf_stop_rxqs(struct qed_hwfn *p_hwfn,
 		return -EINVAL;
 
 	for (qid = rxq_id; qid < rxq_id + num_rxqs; qid++) {
-		if (vf->vf_queues[qid].rxq_active) {
-			rc = qed_sp_eth_rx_queue_stop(p_hwfn,
-						      vf->vf_queues[qid].
-						      fw_rx_qid, false,
-						      cqe_completion);
+		p_queue = &vf->vf_queues[qid];
 
-			if (rc)
-				return rc;
-		}
-		vf->vf_queues[qid].rxq_active = false;
+		if (!p_queue->p_rx_cid)
+			continue;
+
+		rc = qed_eth_rx_queue_stop(p_hwfn,
+					   p_queue->p_rx_cid,
+					   false, cqe_completion);
+		if (rc)
+			return rc;
+
+		vf->vf_queues[qid].p_rx_cid = NULL;
 		vf->num_active_rxqs--;
 	}
 
@@ -1951,22 +2013,24 @@ static int qed_iov_vf_stop_txqs(struct qed_hwfn *p_hwfn,
 				struct qed_vf_info *vf, u16 txq_id, u8 num_txqs)
 {
 	int rc = 0;
+	struct qed_vf_q_info *p_queue;
 	int qid;
 
 	if (txq_id + num_txqs > ARRAY_SIZE(vf->vf_queues))
 		return -EINVAL;
 
 	for (qid = txq_id; qid < txq_id + num_txqs; qid++) {
-		if (vf->vf_queues[qid].txq_active) {
-			rc = qed_sp_eth_tx_queue_stop(p_hwfn,
-						      vf->vf_queues[qid].
-						      fw_tx_qid);
+		p_queue = &vf->vf_queues[qid];
+		if (!p_queue->p_tx_cid)
+			continue;
 
-			if (rc)
-				return rc;
-		}
-		vf->vf_queues[qid].txq_active = false;
+		rc = qed_eth_tx_queue_stop(p_hwfn, p_queue->p_tx_cid);
+		if (rc)
+			return rc;
+
+		p_queue->p_tx_cid = NULL;
 	}
+
 	return rc;
 }
 
@@ -2021,10 +2085,11 @@ static void qed_iov_vf_mbx_update_rxqs(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt,
 				       struct qed_vf_info *vf)
 {
+	struct qed_queue_cid *handlers[QED_MAX_VF_CHAINS_PER_PF];
 	u16 length = sizeof(struct pfvf_def_resp_tlv);
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	struct vfpf_update_rxq_tlv *req;
-	u8 status = PFVF_STATUS_SUCCESS;
+	u8 status = PFVF_STATUS_FAILURE;
 	u8 complete_event_flg;
 	u8 complete_cqe_flg;
 	u16 qid;
@@ -2035,29 +2100,36 @@ static void qed_iov_vf_mbx_update_rxqs(struct qed_hwfn *p_hwfn,
 	complete_cqe_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_CQE_FLAG);
 	complete_event_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_EVENT_FLAG);
 
+	/* Validate inputs */
+	if (req->num_rxqs + req->rx_qid > QED_MAX_VF_CHAINS_PER_PF ||
+	    !qed_iov_validate_rxq(p_hwfn, vf, req->rx_qid)) {
+		DP_INFO(p_hwfn, "VF[%d]: Incorrect Rxqs [%04x, %02x]\n",
+			vf->relative_vf_id, req->rx_qid, req->num_rxqs);
+		goto out;
+	}
+
 	for (i = 0; i < req->num_rxqs; i++) {
 		qid = req->rx_qid + i;
-
-		if (!vf->vf_queues[qid].rxq_active) {
-			DP_NOTICE(p_hwfn, "VF rx_qid = %d isn`t active!\n",
-				  qid);
-			status = PFVF_STATUS_FAILURE;
-			break;
+		if (!vf->vf_queues[qid].p_rx_cid) {
+			DP_INFO(p_hwfn,
+				"VF[%d] rx_qid = %d isn`t active!\n",
+				vf->relative_vf_id, qid);
+			goto out;
 		}
 
-		rc = qed_sp_eth_rx_queues_update(p_hwfn,
-						 vf->vf_queues[qid].fw_rx_qid,
-						 1,
-						 complete_cqe_flg,
-						 complete_event_flg,
-						 QED_SPQ_MODE_EBLOCK, NULL);
-
-		if (rc) {
-			status = PFVF_STATUS_FAILURE;
-			break;
-		}
+		handlers[i] = vf->vf_queues[qid].p_rx_cid;
 	}
 
+	rc = qed_sp_eth_rx_queues_update(p_hwfn, (void **)&handlers,
+					 req->num_rxqs,
+					 complete_cqe_flg,
+					 complete_event_flg,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+	if (rc)
+		goto out;
+
+	status = PFVF_STATUS_SUCCESS;
+out:
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_UPDATE_RXQ,
 			     length, status);
 }
@@ -2268,7 +2340,7 @@ qed_iov_vp_update_rss_param(struct qed_hwfn *p_hwfn,
 			DP_NOTICE(p_hwfn,
 				  "rss_ind_table[%d] = %d, rxq is out of range\n",
 				  i, q_idx);
-		else if (!vf->vf_queues[q_idx].rxq_active)
+		else if (!vf->vf_queues[q_idx].p_rx_cid)
 			DP_NOTICE(p_hwfn,
 				  "rss_ind_table[%d] = %d, rxq is not active\n",
 				  i, q_idx);
@@ -3468,8 +3540,28 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
 	return 0;
 }
 
+static void qed_sriov_enable_qid_config(struct qed_hwfn *hwfn,
+					u16 vfid,
+					struct qed_iov_vf_init_params *params)
+{
+	u16 base, i;
+
+	/* Since we have an equal resource distribution per-VF, and we assume
+	 * the PF has acquired the first QED_PF_L2_QUE queues, we start setting
+	 * the VF queue ids sequentially from there.
+	 */
+	base = FEAT_NUM(hwfn, QED_PF_L2_QUE) + vfid * params->num_queues;
+
+	params->rel_vf_id = vfid;
+	for (i = 0; i < params->num_queues; i++) {
+		params->req_rx_queue[i] = base + i;
+		params->req_tx_queue[i] = base + i;
+	}
+}
+
 static int qed_sriov_enable(struct qed_dev *cdev, int num)
 {
+	struct qed_iov_vf_init_params params;
 	int i, j, rc;
 
 	if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) {
@@ -3478,15 +3570,17 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
 		return -EINVAL;
 	}
 
+	memset(&params, 0, sizeof(params));
+
 	/* Initialize HW for VF access */
 	for_each_hwfn(cdev, j) {
 		struct qed_hwfn *hwfn = &cdev->hwfns[j];
 		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
-		int num_queues;
 
 		/* Make sure not to use more than 16 queues per VF */
-		num_queues = min_t(int,
-				   FEAT_NUM(hwfn, QED_VF_L2_QUE) / num, 16);
+		params.num_queues = min_t(int,
+					  FEAT_NUM(hwfn, QED_VF_L2_QUE) / num,
+					  16);
 
 		if (!ptt) {
 			DP_ERR(hwfn, "Failed to acquire ptt\n");
@@ -3498,7 +3592,8 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
 			if (!qed_iov_is_valid_vfid(hwfn, i, false, true))
 				continue;
 
-			rc = qed_iov_init_hw_for_vf(hwfn, ptt, i, num_queues);
+			qed_sriov_enable_qid_config(hwfn, i, &params);
+			rc = qed_iov_init_hw_for_vf(hwfn, ptt, &params);
 			if (rc) {
 				DP_ERR(cdev, "Failed to enable VF[%d]\n", i);
 				qed_ptt_release(hwfn, ptt);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 3cf515b1b427..509c02b4772e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -58,6 +58,23 @@ struct qed_public_vf_info {
 	int tx_rate;
 };
 
+struct qed_iov_vf_init_params {
+	u16 rel_vf_id;
+
+	/* Number of requested queues; currently, different numbers of Rx
+	 * and Tx queues are not supported.
+	 */
+
+	u16 num_queues;
+
+	/* Allow the client to choose which qzones to use for Rx/Tx,
+	 * and which queue_base to use for Tx queues on a per-queue basis.
+	 * Notice values should be relative to the PF resources.
+	 */
+	u16 req_rx_queue[QED_MAX_VF_CHAINS_PER_PF];
+	u16 req_tx_queue[QED_MAX_VF_CHAINS_PER_PF];
+};
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -99,10 +116,10 @@ struct qed_iov_vf_mbx {
 
 struct qed_vf_q_info {
 	u16 fw_rx_qid;
+	struct qed_queue_cid *p_rx_cid;
 	u16 fw_tx_qid;
+	struct qed_queue_cid *p_tx_cid;
 	u8 fw_cid;
-	u8 rxq_active;
-	u8 txq_active;
 };
 
 enum vf_state {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 3c0633642f4c..60b31a8ede73 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -388,18 +388,18 @@ free_p_iov:
 #define MSTORM_QZONE_START(dev)   (TSTORM_QZONE_START +	\
 				   (TSTORM_QZONE_SIZE * NUM_OF_L2_QUEUES(dev)))
 
-int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-			u8 rx_qid,
-			u16 sb,
-			u8 sb_index,
-			u16 bd_max_bytes,
-			dma_addr_t bd_chain_phys_addr,
-			dma_addr_t cqe_pbl_addr,
-			u16 cqe_pbl_size, void __iomem **pp_prod)
+int
+qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    u16 bd_max_bytes,
+		    dma_addr_t bd_chain_phys_addr,
+		    dma_addr_t cqe_pbl_addr,
+		    u16 cqe_pbl_size, void __iomem **pp_prod)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct pfvf_start_queue_resp_tlv *resp;
 	struct vfpf_start_rxq_tlv *req;
+	u8 rx_qid = p_cid->rel.queue_id;
 	int rc;
 
 	/* clear mailbox and prep first tlv */
@@ -409,21 +409,22 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 	req->cqe_pbl_addr = cqe_pbl_addr;
 	req->cqe_pbl_size = cqe_pbl_size;
 	req->rxq_addr = bd_chain_phys_addr;
-	req->hw_sb = sb;
-	req->sb_index = sb_index;
+	req->hw_sb = p_cid->rel.sb;
+	req->sb_index = p_cid->rel.sb_idx;
 	req->bd_max_bytes = bd_max_bytes;
 	req->stat_id = -1;
 
 	/* If PF is legacy, we'll need to calculate producers ourselves
 	 * as well as clean them.
 	 */
-	if (pp_prod && p_iov->b_pre_fp_hsi) {
+	if (p_iov->b_pre_fp_hsi) {
 		u8 hw_qid = p_iov->acquire_resp.resc.hw_qid[rx_qid];
 		u32 init_prod_val = 0;
 
-		*pp_prod = (u8 __iomem *)p_hwfn->regview +
-					 MSTORM_QZONE_START(p_hwfn->cdev) +
-					 hw_qid * MSTORM_QZONE_SIZE;
+		*pp_prod = (u8 __iomem *)
+		    p_hwfn->regview +
+		    MSTORM_QZONE_START(p_hwfn->cdev) +
+		    hw_qid * MSTORM_QZONE_SIZE;
 
 		/* Init the rcq, rx bd and rx sge (if valid) producers to 0 */
 		__internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32),
@@ -444,7 +445,7 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 	}
 
 	/* Learn the address of the producer from the response */
-	if (pp_prod && !p_iov->b_pre_fp_hsi) {
+	if (!p_iov->b_pre_fp_hsi) {
 		u32 init_prod_val = 0;
 
 		*pp_prod = (u8 __iomem *)p_hwfn->regview + resp->offset;
@@ -462,7 +463,8 @@ exit:
 	return rc;
 }
 
-int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, u16 rx_qid, bool cqe_completion)
+int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
+		       struct qed_queue_cid *p_cid, bool cqe_completion)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_stop_rxqs_tlv *req;
@@ -472,7 +474,7 @@ int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, u16 rx_qid, bool cqe_completion)
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_RXQS, sizeof(*req));
 
-	req->rx_qid = rx_qid;
+	req->rx_qid = p_cid->rel.queue_id;
 	req->num_rxqs = 1;
 	req->cqe_completion = cqe_completion;
 
@@ -496,28 +498,28 @@ exit:
 	return rc;
 }
 
-int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-			u16 tx_queue_id,
-			u16 sb,
-			u8 sb_index,
-			dma_addr_t pbl_addr,
-			u16 pbl_size, void __iomem **pp_doorbell)
+int
+qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    dma_addr_t pbl_addr,
+		    u16 pbl_size, void __iomem **pp_doorbell)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct pfvf_start_queue_resp_tlv *resp;
 	struct vfpf_start_txq_tlv *req;
+	u16 qid = p_cid->rel.queue_id;
 	int rc;
 
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_START_TXQ, sizeof(*req));
 
-	req->tx_qid = tx_queue_id;
+	req->tx_qid = qid;
 
 	/* Tx */
 	req->pbl_addr = pbl_addr;
 	req->pbl_size = pbl_size;
-	req->hw_sb = sb;
-	req->sb_index = sb_index;
+	req->hw_sb = p_cid->rel.sb;
+	req->sb_index = p_cid->rel.sb_idx;
 
 	/* add list termination tlv */
 	qed_add_tlv(p_hwfn, &p_iov->offset,
@@ -533,33 +535,29 @@ int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
 		goto exit;
 	}
 
-	if (pp_doorbell) {
-		/* Modern PFs provide the actual offsets, while legacy
-		 * provided only the queue id.
-		 */
-		if (!p_iov->b_pre_fp_hsi) {
-			*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-						     resp->offset;
-		} else {
-			u8 cid = p_iov->acquire_resp.resc.cid[tx_queue_id];
-			u32 db_addr;
-
-			db_addr = qed_db_addr_vf(cid, DQ_DEMS_LEGACY);
-			*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-						     db_addr;
-		}
+	/* Modern PFs provide the actual offsets, while legacy
+	 * PFs provide only the queue id.
+	 */
+	if (!p_iov->b_pre_fp_hsi) {
+		*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + resp->offset;
+	} else {
+		u8 cid = p_iov->acquire_resp.resc.cid[qid];
 
-		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-			   "Txq[0x%02x]: doorbell at %p [offset 0x%08x]\n",
-			   tx_queue_id, *pp_doorbell, resp->offset);
+		*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
+					     qed_db_addr_vf(cid,
+							    DQ_DEMS_LEGACY);
 	}
+
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "Txq[0x%02x]: doorbell at %p [offset 0x%08x]\n",
+		   qid, *pp_doorbell, resp->offset);
 exit:
 	qed_vf_pf_req_end(p_hwfn, rc);
 
 	return rc;
 }
 
-int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
+int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_stop_txqs_tlv *req;
@@ -569,7 +567,7 @@ int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_TXQS, sizeof(*req));
 
-	req->tx_qid = tx_qid;
+	req->tx_qid = p_cid->rel.queue_id;
 	req->num_txqs = 1;
 
 	/* add list termination tlv */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 325c250d4ee5..11eb3854e6f2 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -666,10 +666,7 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
 /**
  * @brief VF - start the RX Queue by sending a message to the PF
  * @param p_hwfn
- * @param cid                   - zero based within the VF
- * @param rx_queue_id           - zero based within the VF
- * @param sb                    - VF status block for this queue
- * @param sb_index              - Index within the status block
+ * @param p_cid			- Only relative fields are relevant
  * @param bd_max_bytes          - maximum number of bytes per bd
  * @param bd_chain_phys_addr    - physical address of bd chain
  * @param cqe_pbl_addr          - physical address of pbl
@@ -680,9 +677,7 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn);
  * @return int
  */
 int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-			u8 rx_queue_id,
-			u16 sb,
-			u8 sb_index,
+			struct qed_queue_cid *p_cid,
 			u16 bd_max_bytes,
 			dma_addr_t bd_chain_phys_addr,
 			dma_addr_t cqe_pbl_addr,
@@ -702,24 +697,23 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
  *
  * @return int
  */
-int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-			u16 tx_queue_id,
-			u16 sb,
-			u8 sb_index,
-			dma_addr_t pbl_addr,
-			u16 pbl_size, void __iomem **pp_doorbell);
+int
+qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    dma_addr_t pbl_addr,
+		    u16 pbl_size, void __iomem **pp_doorbell);
 
 /**
  * @brief VF - stop the RX queue by sending a message to the PF
  *
  * @param p_hwfn
- * @param rx_qid
+ * @param p_cid
  * @param cqe_completion
  *
  * @return int
  */
 int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
-		       u16 rx_qid, bool cqe_completion);
+		       struct qed_queue_cid *p_cid, bool cqe_completion);
 
 /**
  * @brief VF - stop the TX queue by sending a message to the PF
@@ -729,7 +723,7 @@ int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
  *
  * @return int
  */
-int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid);
+int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid);
 
 /**
  * @brief VF - send a vport update command
@@ -902,9 +896,7 @@ static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 }
 
 static inline int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-				      u8 rx_queue_id,
-				      u16 sb,
-				      u8 sb_index,
+				      struct qed_queue_cid *p_cid,
 				      u16 bd_max_bytes,
 				      dma_addr_t bd_chain_phys_adr,
 				      dma_addr_t cqe_pbl_addr,
@@ -914,9 +906,7 @@ static inline int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 }
 
 static inline int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-				      u16 tx_queue_id,
-				      u16 sb,
-				      u8 sb_index,
+				      struct qed_queue_cid *p_cid,
 				      dma_addr_t pbl_addr,
 				      u16 pbl_size, void __iomem **pp_doorbell)
 {
@@ -924,12 +914,14 @@ static inline int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
 }
 
 static inline int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
-				     u16 rx_qid, bool cqe_completion)
+				     struct qed_queue_cid *p_cid,
+				     bool cqe_completion)
 {
 	return -EINVAL;
 }
 
-static inline int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
+static inline int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn,
+				     struct qed_queue_cid *p_cid)
 {
 	return -EINVAL;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 2116c4cc8924..c2135765f8ec 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -264,6 +264,8 @@ struct qede_rx_queue {
 	u64			rx_hw_errors;
 	u64			rx_alloc_errors;
 	u64			rx_ip_frags;
+
+	void *handle;
 };
 
 union db_prod {
@@ -293,6 +295,8 @@ struct qede_tx_queue {
 	u64			stopped_cnt;
 
 	bool			is_legacy;
+	void *handle;
+
 };
 
 #define BD_UNMAP_ADDR(bd)		HILO_U64(le32_to_cpu((bd)->addr.hi), \
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 64c7f3b75283..834921178615 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -3334,6 +3334,12 @@ static int qede_drain_txq(struct qede_dev *edev,
 	return 0;
 }
 
+static int qede_stop_txq(struct qede_dev *edev,
+			 struct qede_tx_queue *txq, int rss_id)
+{
+	return edev->ops->q_tx_stop(edev->cdev, rss_id, txq->handle);
+}
+
 static int qede_stop_queues(struct qede_dev *edev)
 {
 	struct qed_update_vport_params vport_update_params;
@@ -3367,28 +3373,18 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 	/* Stop all Queues in reverse order */
 	for (i = QEDE_QUEUE_CNT(edev) - 1; i >= 0; i--) {
-		struct qed_stop_rxq_params rx_params;
-
 		fp = &edev->fp_array[i];
 
 		/* Stop the Tx Queue(s) */
 		if (fp->type & QEDE_FASTPATH_TX) {
-			struct qed_stop_txq_params tx_params;
-
-			tx_params.rss_id = i;
-			tx_params.tx_queue_id = fp->txq->index;
-				rc = edev->ops->q_tx_stop(cdev, &tx_params);
-				if (rc)
-					return rc;
+			rc = qede_stop_txq(edev, fp->txq, i);
+			if (rc)
+				return rc;
 		}
 
 		/* Stop the Rx Queue */
 		if (fp->type & QEDE_FASTPATH_RX) {
-			memset(&rx_params, 0, sizeof(rx_params));
-			rx_params.rss_id = i;
-			rx_params.rx_queue_id = fp->rxq->rxq_id;
-
-			rc = edev->ops->q_rx_stop(cdev, &rx_params);
+			rc = edev->ops->q_rx_stop(cdev, i, fp->rxq->handle);
 			if (rc) {
 				DP_ERR(edev, "Failed to stop RXQ #%d\n", i);
 				return rc;
@@ -3404,6 +3400,46 @@ static int qede_stop_queues(struct qede_dev *edev)
 	return rc;
 }
 
+static int qede_start_txq(struct qede_dev *edev,
+			  struct qede_fastpath *fp,
+			  struct qede_tx_queue *txq, u8 rss_id, u16 sb_idx)
+{
+	dma_addr_t phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
+	u32 page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
+	struct qed_queue_start_common_params params;
+	struct qed_txq_start_ret_params ret_params;
+	int rc;
+
+	memset(&params, 0, sizeof(params));
+	memset(&ret_params, 0, sizeof(ret_params));
+
+	params.queue_id = txq->index;
+	params.sb = fp->sb_info->igu_sb_id;
+	params.sb_idx = sb_idx;
+
+	rc = edev->ops->q_tx_start(edev->cdev, rss_id, &params, phys_table,
+				   page_cnt, &ret_params);
+	if (rc) {
+		DP_ERR(edev, "Start TXQ #%d failed %d\n", txq->index, rc);
+		return rc;
+	}
+
+	txq->doorbell_addr = ret_params.p_doorbell;
+	txq->handle = ret_params.p_handle;
+
+	/* Determine the FW consumer address associated */
+	txq->hw_cons_ptr = &fp->sb_info->sb_virt->pi_array[sb_idx];
+
+	/* Prepare the doorbell parameters */
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_DEST, DB_DEST_XCM);
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD, DB_AGG_CMD_SET);
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_VAL_SEL,
+		  DQ_XCM_ETH_TX_BD_PROD_CMD);
+	txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD;
+
+	return rc;
+}
+
 static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 {
 	int vlan_removal_en = 1;
@@ -3445,11 +3481,12 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		u32 page_cnt;
 
 		if (fp->type & QEDE_FASTPATH_RX) {
+			struct qed_rxq_start_ret_params ret_params;
 			struct qede_rx_queue *rxq = fp->rxq;
 			__le16 *val;
 
+			memset(&ret_params, 0, sizeof(ret_params));
 			memset(&q_params, 0, sizeof(q_params));
-			q_params.rss_id = i;
 			q_params.queue_id = rxq->rxq_id;
 			q_params.vport_id = 0;
 			q_params.sb = fp->sb_info->igu_sb_id;
@@ -3459,18 +3496,21 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			    qed_chain_get_pbl_phys(&rxq->rx_comp_ring);
 			page_cnt = qed_chain_get_page_cnt(&rxq->rx_comp_ring);
 
-			rc = edev->ops->q_rx_start(cdev, &q_params,
+			rc = edev->ops->q_rx_start(cdev, i, &q_params,
 						   rxq->rx_buf_size,
 						   rxq->rx_bd_ring.p_phys_addr,
 						   p_phys_table,
-						   page_cnt,
-						   &rxq->hw_rxq_prod_addr);
+						   page_cnt, &ret_params);
 			if (rc) {
 				DP_ERR(edev, "Start RXQ #%d failed %d\n", i,
 				       rc);
 				return rc;
 			}
 
+			/* Use the return parameters */
+			rxq->hw_rxq_prod_addr = ret_params.p_prod;
+			rxq->handle = ret_params.p_handle;
+
 			val = &fp->sb_info->sb_virt->pi_array[RX_PI];
 			rxq->hw_cons_ptr = val;
 
@@ -3478,38 +3518,9 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			struct qede_tx_queue *txq = fp->txq;
-
-			p_phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
-			page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
-
-			memset(&q_params, 0, sizeof(q_params));
-			q_params.rss_id = i;
-			q_params.queue_id = txq->index;
-			q_params.vport_id = 0;
-			q_params.sb = fp->sb_info->igu_sb_id;
-			q_params.sb_idx = TX_PI(0);
-
-			rc = edev->ops->q_tx_start(cdev, &q_params,
-						   p_phys_table, page_cnt,
-						   &txq->doorbell_addr);
-			if (rc) {
-				DP_ERR(edev, "Start TXQ #%d failed %d\n",
-				       txq->index, rc);
+			rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0));
+			if (rc)
 				return rc;
-			}
-
-			txq->hw_cons_ptr =
-				&fp->sb_info->sb_virt->pi_array[TX_PI(0)];
-			SET_FIELD(txq->tx_db.data.params,
-				  ETH_DB_DATA_DEST, DB_DEST_XCM);
-			SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD,
-				  DB_AGG_CMD_SET);
-			SET_FIELD(txq->tx_db.data.params,
-				  ETH_DB_DATA_AGG_VAL_SEL,
-				  DQ_XCM_ETH_TX_BD_PROD_CMD);
-
-			txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD;
 		}
 	}
 
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 9755a3feb52e..7a52f7c58c37 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -15,6 +15,29 @@
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_iov_if.h>
 
+struct qed_queue_start_common_params {
+	/* Should always be relative to entity sending this. */
+	u8 vport_id;
+	u16 queue_id;
+
+	/* Relative, but relevant only for PFs */
+	u8 stats_id;
+
+	/* These are always absolute */
+	u16 sb;
+	u8 sb_idx;
+};
+
+struct qed_rxq_start_ret_params {
+	void __iomem *p_prod;
+	void *p_handle;
+};
+
+struct qed_txq_start_ret_params {
+	void __iomem *p_doorbell;
+	void *p_handle;
+};
+
 struct qed_dev_eth_info {
 	struct qed_dev_info common;
 
@@ -56,18 +79,6 @@ struct qed_start_vport_params {
 	bool clear_stats;
 };
 
-struct qed_stop_rxq_params {
-	u8 rss_id;
-	u8 rx_queue_id;
-	u8 vport_id;
-	bool eq_completion_only;
-};
-
-struct qed_stop_txq_params {
-	u8 rss_id;
-	u8 tx_queue_id;
-};
-
 enum qed_filter_rx_mode_type {
 	QED_FILTER_RX_MODE_TYPE_REGULAR,
 	QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC,
@@ -112,15 +123,6 @@ struct qed_filter_params {
 	union qed_filter_type_params filter;
 };
 
-struct qed_queue_start_common_params {
-	u8 rss_id;
-	u8 queue_id;
-	u8 vport_id;
-	u16 sb;
-	u16 sb_idx;
-	u16 vf_qid;
-};
-
 struct qed_tunn_params {
 	u16 vxlan_port;
 	u8 update_vxlan_port;
@@ -220,24 +222,24 @@ struct qed_eth_ops {
 			    struct qed_update_vport_params *params);
 
 	int (*q_rx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  u16 bd_max_bytes,
 			  dma_addr_t bd_chain_phys_addr,
 			  dma_addr_t cqe_pbl_addr,
 			  u16 cqe_pbl_size,
-			  void __iomem **pp_prod);
+			  struct qed_rxq_start_ret_params *ret_params);
 
-	int (*q_rx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_rxq_params *params);
+	int (*q_rx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*q_tx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  dma_addr_t pbl_addr,
 			  u16 pbl_size,
-			  void __iomem **pp_doorbell);
+			  struct qed_txq_start_ret_params *ret_params);
 
-	int (*q_tx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_txq_params *params);
+	int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*filter_config)(struct qed_dev *cdev,
 			     struct qed_filter_params *params);
-- 
cgit v1.2.3


From f4ed2fe34fb793755ef8cfc3509e783c4709ffc1 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Tue, 29 Nov 2016 15:16:46 +0530
Subject: net: phy: add mdix_ctrl to hold the user configuration.

Add new parameter mdix_ctrl to hold the user configuration.
The existing mdix maintains the current status of whether MDI(X) crossover
is performed or not.
mdix_ctrl can be set to ETH_TP_MDI, ETH_TP_MDI_X or ETH_TP_MDI_AUTO.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index b53177fd38af..feb8a98e8dd3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -450,6 +450,7 @@ struct phy_device {
 	struct net_device *attached_dev;
 
 	u8 mdix;
+	u8 mdix_ctrl;
 
 	void (*adjust_link)(struct net_device *dev);
 };
-- 
cgit v1.2.3


From 1c1b522808a18402f043c1418b4e48c7355480cc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 30 Nov 2016 17:59:37 +0200
Subject: net/mlx5e: Implement Fragmented Work Queue (WQ)

Add new type of struct mlx5_frag_buf which is used to allocate fragmented
buffers rather than contiguous, and make the Completion Queues (CQs) use
it as they are big (default of 2MB per CQ in Striding RQ).

This fixes the failures of type:
"mlx5e_open_locked: mlx5e_open_channels failed, -12"
due to dma_zalloc_coherent having insufficient contiguous coherent memory
to satisfy the driver's request when the user tries to set up more or
larger rings.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c   | 66 +++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 ++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.c      | 26 ++++++---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h      | 18 +++++--
 include/linux/mlx5/driver.h                       | 11 ++++
 6 files changed, 116 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 2c6e3c7b7417..44791de5afe6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -106,6 +106,63 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_free);
 
+int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			     struct mlx5_frag_buf *buf, int node)
+{
+	int i;
+
+	buf->size = size;
+	buf->npages = 1 << get_order(size);
+	buf->page_shift = PAGE_SHIFT;
+	buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list),
+			     GFP_KERNEL);
+	if (!buf->frags)
+		goto err_out;
+
+	for (i = 0; i < buf->npages; i++) {
+		struct mlx5_buf_list *frag = &buf->frags[i];
+		int frag_sz = min_t(int, size, PAGE_SIZE);
+
+		frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz,
+							  &frag->map, node);
+		if (!frag->buf)
+			goto err_free_buf;
+		if (frag->map & ((1 << buf->page_shift) - 1)) {
+			dma_free_coherent(&dev->pdev->dev, frag_sz,
+					  buf->frags[i].buf, buf->frags[i].map);
+			mlx5_core_warn(dev, "unexpected map alignment: 0x%p, page_shift=%d\n",
+				       (void *)frag->map, buf->page_shift);
+			goto err_free_buf;
+		}
+		size -= frag_sz;
+	}
+
+	return 0;
+
+err_free_buf:
+	while (i--)
+		dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, buf->frags[i].buf,
+				  buf->frags[i].map);
+	kfree(buf->frags);
+err_out:
+	return -ENOMEM;
+}
+
+void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf)
+{
+	int size = buf->size;
+	int i;
+
+	for (i = 0; i < buf->npages; i++) {
+		int frag_sz = min_t(int, size, PAGE_SIZE);
+
+		dma_free_coherent(&dev->pdev->dev, frag_sz, buf->frags[i].buf,
+				  buf->frags[i].map);
+		size -= frag_sz;
+	}
+	kfree(buf->frags);
+}
+
 static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev,
 						 int node)
 {
@@ -230,3 +287,12 @@ void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas)
 	}
 }
 EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
+
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas)
+{
+	int i;
+
+	for (i = 0; i < buf->npages; i++)
+		pas[i] = cpu_to_be64(buf->frags[i].map);
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 442dbc3e6be4..f16f7fbd2044 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -286,7 +286,7 @@ struct mlx5e_cq {
 	u16                        decmprs_wqe_counter;
 
 	/* control */
-	struct mlx5_wq_ctrl        wq_ctrl;
+	struct mlx5_frag_wq_ctrl   wq_ctrl;
 } ____cacheline_aligned_in_smp;
 
 struct mlx5e_rq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6b492ca17d7e..ba25cd361bb2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1201,7 +1201,7 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
 
 static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
-	mlx5_wq_destroy(&cq->wq_ctrl);
+	mlx5_cqwq_destroy(&cq->wq_ctrl);
 }
 
 static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
@@ -1218,7 +1218,7 @@ static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	int err;
 
 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
-		sizeof(u64) * cq->wq_ctrl.buf.npages;
+		sizeof(u64) * cq->wq_ctrl.frag_buf.npages;
 	in = mlx5_vzalloc(inlen);
 	if (!in)
 		return -ENOMEM;
@@ -1227,15 +1227,15 @@ static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 
 	memcpy(cqc, param->cqc, sizeof(param->cqc));
 
-	mlx5_fill_page_array(&cq->wq_ctrl.buf,
-			     (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+	mlx5_fill_page_frag_array(&cq->wq_ctrl.frag_buf,
+				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
 
 	mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
 
 	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
 	MLX5_SET(cqc,   cqc, c_eqn,         eqn);
 	MLX5_SET(cqc,   cqc, uar_page,      mcq->uar->index);
-	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.frag_buf.page_shift -
 					    MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET64(cqc, cqc, dbr_addr,      cq->wq_ctrl.db.dma);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 821a087c7ae2..921673c42bc9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -101,13 +101,15 @@ err_db_free:
 
 int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		     void *cqc, struct mlx5_cqwq *wq,
-		     struct mlx5_wq_ctrl *wq_ctrl)
+		     struct mlx5_frag_wq_ctrl *wq_ctrl)
 {
 	int err;
 
-	wq->log_stride = 6 + MLX5_GET(cqc, cqc, cqe_sz);
-	wq->log_sz = MLX5_GET(cqc, cqc, log_cq_size);
-	wq->sz_m1 = (1 << wq->log_sz) - 1;
+	wq->log_stride	= 6 + MLX5_GET(cqc, cqc, cqe_sz);
+	wq->log_sz	= MLX5_GET(cqc, cqc, log_cq_size);
+	wq->sz_m1	= (1 << wq->log_sz) - 1;
+	wq->log_frag_strides = PAGE_SHIFT - wq->log_stride;
+	wq->frag_sz_m1	= (1 << wq->log_frag_strides) - 1;
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
@@ -115,14 +117,16 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		return err;
 	}
 
-	err = mlx5_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
-				  &wq_ctrl->buf, param->buf_numa_node);
+	err = mlx5_frag_buf_alloc_node(mdev, mlx5_cqwq_get_byte_size(wq),
+				       &wq_ctrl->frag_buf,
+				       param->buf_numa_node);
 	if (err) {
-		mlx5_core_warn(mdev, "mlx5_buf_alloc_node() failed, %d\n", err);
+		mlx5_core_warn(mdev, "mlx5_frag_buf_alloc_node() failed, %d\n",
+			       err);
 		goto err_db_free;
 	}
 
-	wq->buf = wq_ctrl->buf.direct.buf;
+	wq->frag_buf = wq_ctrl->frag_buf;
 	wq->db  = wq_ctrl->db.db;
 
 	wq_ctrl->mdev = mdev;
@@ -184,3 +188,9 @@ void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl)
 	mlx5_buf_free(wq_ctrl->mdev, &wq_ctrl->buf);
 	mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db);
 }
+
+void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl)
+{
+	mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->frag_buf);
+	mlx5_db_free(wq_ctrl->mdev, &wq_ctrl->db);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index 6c2a8f95093c..d8afed898c31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -47,6 +47,12 @@ struct mlx5_wq_ctrl {
 	struct mlx5_db		db;
 };
 
+struct mlx5_frag_wq_ctrl {
+	struct mlx5_core_dev	*mdev;
+	struct mlx5_frag_buf	frag_buf;
+	struct mlx5_db		db;
+};
+
 struct mlx5_wq_cyc {
 	void			*buf;
 	__be32			*db;
@@ -55,12 +61,14 @@ struct mlx5_wq_cyc {
 };
 
 struct mlx5_cqwq {
-	void			*buf;
+	struct mlx5_frag_buf	frag_buf;
 	__be32			*db;
 	u32			sz_m1;
+	u32			frag_sz_m1;
 	u32			cc; /* consumer counter */
 	u8			log_sz;
 	u8			log_stride;
+	u8			log_frag_strides;
 };
 
 struct mlx5_wq_ll {
@@ -81,7 +89,7 @@ u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq);
 
 int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 		     void *cqc, struct mlx5_cqwq *wq,
-		     struct mlx5_wq_ctrl *wq_ctrl);
+		     struct mlx5_frag_wq_ctrl *wq_ctrl);
 u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq);
 
 int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
@@ -90,6 +98,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
+void mlx5_cqwq_destroy(struct mlx5_frag_wq_ctrl *wq_ctrl);
 
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
@@ -116,7 +125,10 @@ static inline u32 mlx5_cqwq_get_ci(struct mlx5_cqwq *wq)
 
 static inline void *mlx5_cqwq_get_wqe(struct mlx5_cqwq *wq, u32 ix)
 {
-	return wq->buf + (ix << wq->log_stride);
+	unsigned int frag = (ix >> wq->log_frag_strides);
+
+	return wq->frag_buf.frags[frag].buf +
+		((wq->frag_sz_m1 & ix) << wq->log_stride);
 }
 
 static inline u32 mlx5_cqwq_get_wrap_cnt(struct mlx5_cqwq *wq)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 68b85efc3908..0ae55361e674 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -318,6 +318,13 @@ struct mlx5_buf {
 	u8			page_shift;
 };
 
+struct mlx5_frag_buf {
+	struct mlx5_buf_list	*frags;
+	int			npages;
+	int			size;
+	u8			page_shift;
+};
+
 struct mlx5_eq_tasklet {
 	struct list_head list;
 	struct list_head process_list;
@@ -822,6 +829,9 @@ int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
+int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			     struct mlx5_frag_buf *buf, int node);
+void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
 struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 						      gfp_t flags, int npages);
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
@@ -866,6 +876,7 @@ void mlx5_unregister_debugfs(void);
 int mlx5_eq_init(struct mlx5_core_dev *dev);
 void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
 void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-- 
cgit v1.2.3


From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 30 Nov 2016 17:10:10 +0100
Subject: bpf: BPF for lightweight tunnel infrastructure

Registers new BPF program types which correspond to the LWT hooks:
  - BPF_PROG_TYPE_LWT_IN   => dst_input()
  - BPF_PROG_TYPE_LWT_OUT  => dst_output()
  - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the
capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and
   may only read the data of an skb. This prevents modification and
   possible invalidation of already validated packet headers on receive
   and the construction of illegal headers while the IP headers are
   still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet
   content as well as prepending an L2 header via a newly introduced
   helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is
   invoked after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h        |   2 +-
 include/uapi/linux/bpf.h      |  32 +++-
 include/uapi/linux/lwtunnel.h |  23 +++
 kernel/bpf/verifier.c         |  14 +-
 net/Kconfig                   |   8 +
 net/core/Makefile             |   1 +
 net/core/filter.c             | 173 ++++++++++++++++++
 net/core/lwt_bpf.c            | 396 ++++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c           |   2 +
 9 files changed, 646 insertions(+), 5 deletions(-)
 create mode 100644 net/core/lwt_bpf.c

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a281435..7ba644626553 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@ struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d1456f..22ac82792687 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,9 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_change_head()
+ *     Grows headroom of skb and adjusts MAC header offset accordingly.
+ *     Will extends/reallocae as required automatically.
+ *     May change skb data pointer and will thus invalidate any check
+ *     performed for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -453,7 +466,8 @@ union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_change_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 453cc6215bfd..92724cba1eba 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	LWT_BPF_XMIT_HEADROOM,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 256
+
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8740c5fa02fc..8135cb1077ee 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
-				       const struct bpf_call_arg_meta *meta)
+				       const struct bpf_call_arg_meta *meta,
+				       enum bpf_access_type t)
 {
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_OUT:
+		/* dst_input() and dst_output() can't write for now */
+		if (t == BPF_WRITE)
+			return false;
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_LWT_XMIT:
 		if (meta)
 			return meta->pkt_access;
 
@@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+	if (type == PTR_TO_PACKET &&
+	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd340b72b..a1005007224c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -402,6 +402,14 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config LWTUNNEL_BPF
+	bool "Execute BPF program as route nexthop action"
+	depends on LWTUNNEL
+	default y if LWTUNNEL=y
+	---help---
+	  Allows to run BPF programs as a nexthop action following a route
+	  lookup for incoming and outgoing packets.
+
 config DST_CACHE
 	bool
 	default n
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2ddca5..f6761b6e3b29 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262b8ebb..1c4d0faf22c8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
 				 u32 flags)
 {
+	/* Verify that a link layer header is carried */
+	if (unlikely(skb->mac_header >= skb->network_header)) {
+		kfree_skb(skb);
+		return -ERANGE;
+	}
+
 	bpf_push_mac_rcsum(skb);
 	return flags & BPF_F_INGRESS ?
 	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
@@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+	   u64, flags)
+{
+	u32 max_len = __bpf_skb_max_len(skb);
+	u32 new_len = skb->len + head_room;
+	int ret;
+
+	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+		     new_len < skb->len))
+		return -EINVAL;
+
+	ret = skb_cow(skb, head_room);
+	if (likely(!ret)) {
+		/* Idea for this helper is that we currently only allow
+		 * to expand on mac header. This means that skb->protocol
+		 * network header, etc, stay as is. Compared to
+		 * bpf_skb_change_tail(), we're more flexible due to not
+		 * needing to linearize or reset GSO. Intention for this
+		 * helper is to be used by an L3 skb that needs to push
+		 * mac header for redirection into L2 device.
+		 */
+		__skb_push(skb, head_room);
+		memset(skb->data, 0, head_room);
+		skb_reset_mac_header(skb);
+	}
+
+	bpf_compute_data_end(skb);
+	/* Propagate skb_cow() failure instead of claiming success. */
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_skb_change_head_proto = {
+	.func		= bpf_skb_change_head,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
 	    func == bpf_skb_vlan_pop ||
 	    func == bpf_skb_store_bytes ||
 	    func == bpf_skb_change_proto ||
+	    func == bpf_skb_change_head ||
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
@@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
+	case BPF_FUNC_clone_redirect:
+		return &bpf_clone_redirect_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_change_head:
+		return &bpf_skb_change_head_proto;
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
+	case BPF_FUNC_l3_csum_replace:
+		return &bpf_l3_csum_replace_proto;
+	case BPF_FUNC_l4_csum_replace:
+		return &bpf_l4_csum_replace_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
+	default:
+		return lwt_inout_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool lwt_is_valid_access(int off, int size,
+				enum bpf_access_type type,
+				enum bpf_reg_type *reg_type)
+{
+	switch (off) {
+	case offsetof(struct __sk_buff, tc_classid):
+		return false;
+	}
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct __sk_buff, mark):
+		case offsetof(struct __sk_buff, priority):
+		case offsetof(struct __sk_buff, cb[0]) ...
+		     offsetof(struct __sk_buff, cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct __sk_buff, data):
+		*reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct __sk_buff, data_end):
+		*reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return __is_valid_access(off, size, type);
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops lwt_inout_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+	.get_func_proto		= lwt_xmit_func_proto,
+	.is_valid_access	= lwt_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+	.ops	= &lwt_inout_ops,
+	.type	= BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+	.ops	= &lwt_inout_ops,
+	.type	= BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+	.ops	= &lwt_xmit_ops,
+	.type	= BPF_PROG_TYPE_LWT_XMIT,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&lwt_in_type);
+	bpf_register_prog_type(&lwt_out_type);
+	bpf_register_prog_type(&lwt_xmit_type);
 
 	return 0;
 }
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 000000000000..71bb3e2eca08
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,396 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
+struct bpf_lwt {
+	struct bpf_lwt_prog in;
+	struct bpf_lwt_prog out;
+	struct bpf_lwt_prog xmit;
+	int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+		       struct dst_entry *dst, bool can_redirect)
+{
+	int ret;
+
+	/* Preempt disable is needed to protect per-cpu redirect_info between
+	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+	 * access to maps strictly require a rcu_read_lock() for protection,
+	 * mixing with BH RCU lock doesn't work.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	bpf_compute_data_end(skb);
+	ret = bpf_prog_run_save_cb(lwt->prog, skb);
+	rcu_read_unlock();
+
+	switch (ret) {
+	case BPF_OK:
+		break;
+
+	case BPF_REDIRECT:
+		if (unlikely(!can_redirect)) {
+			pr_warn_once("Illegal redirect return code in prog %s\n",
+				     lwt->name ? : "<unknown>");
+			ret = BPF_OK;
+		} else {
+			ret = skb_do_redirect(skb);
+			if (ret == 0)
+				ret = BPF_REDIRECT;
+		}
+		break;
+
+	case BPF_DROP:
+		kfree_skb(skb);
+		ret = -EPERM;
+		break;
+
+	default:
+		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
+		kfree_skb(skb);
+		ret = -EINVAL;
+		break;
+	}
+
+	preempt_enable();
+
+	return ret;
+}
+
+/* Run the in-program, if any, then chain to the original input handler. */
+static int bpf_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	int ret;
+
+	if (bpf->in.prog) {
+		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_input)) {
+		pr_warn_once("orig_input not set on dst for prog %s\n",
+			     bpf->in.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->out.prog) {
+		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_output)) {
+		pr_warn_once("orig_output not set on dst for prog %s\n",
+			     bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+	int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+	if (skb_headroom(skb) < hh_len) {
+		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->xmit.prog) {
+		int ret;
+
+		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+		switch (ret) {
+		case BPF_OK:
+			/* If the header was expanded, headroom might be too
+			 * small for L2 header to come, expand as needed.
+			 */
+			ret = xmit_check_hhlen(skb);
+			if (unlikely(ret))
+				return ret;
+
+			return LWTUNNEL_XMIT_CONTINUE;
+		case BPF_REDIRECT:
+			return LWTUNNEL_XMIT_DONE;
+		default:
+			return ret;
+		}
+	}
+
+	return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+	if (prog->prog)
+		bpf_prog_put(prog->prog);
+
+	kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	bpf_lwt_prog_destroy(&bpf->in);
+	bpf_lwt_prog_destroy(&bpf->out);
+	bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
+	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				.len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+			  enum bpf_prog_type type)
+{
+	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+		return -EINVAL;
+
+	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+	if (!prog->name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+	p = bpf_prog_get_type(fd, type);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	prog->prog = p;
+
+	return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
+	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
+			   struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[LWT_BPF_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct bpf_lwt *bpf;
+	int ret;
+
+	if (family != AF_INET && family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*bpf));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->type = LWTUNNEL_ENCAP_BPF;
+	bpf = bpf_lwt_lwtunnel(newts);
+
+	if (tb[LWT_BPF_IN]) {
+		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+				     BPF_PROG_TYPE_LWT_IN);
+		if (ret  < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_OUT]) {
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+				     BPF_PROG_TYPE_LWT_OUT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT]) {
+		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+				     BPF_PROG_TYPE_LWT_XMIT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT_HEADROOM]) {
+		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+		if (headroom > LWT_BPF_MAX_HEADROOM) {
+			ret = -ERANGE;
+			goto errout;
+		}
+
+		newts->headroom = headroom;
+	}
+
+	bpf->family = family;
+	*ts = newts;
+
+	return 0;
+
+errout:
+	bpf_destroy_state(newts);
+	kfree(newts);
+	return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+			     struct bpf_lwt_prog *prog)
+{
+	struct nlattr *nest;
+
+	if (!prog->prog)
+		return 0;
+
+	nest = nla_nest_start(skb, attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (prog->name &&
+	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int nest_len = nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+		       0;
+
+	return nest_len + /* LWT_BPF_IN */
+	       nest_len + /* LWT_BPF_OUT */
+	       nest_len + /* LWT_BPF_XMIT */
+	       0;
+}
+
+static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests which
+	 * results in a new bpf_prog instance. Compare names for now; 0 == equal.
+	 */
+	if (!a->name && !b->name)
+		return 0;
+
+	if (!a->name || !b->name)
+		return 1;
+
+	return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+	.build_state	= bpf_build_state,
+	.destroy_state	= bpf_destroy_state,
+	.input		= bpf_input,
+	.output		= bpf_output,
+	.xmit		= bpf_xmit,
+	.fill_encap	= bpf_fill_encap_info,
+	.get_encap_size = bpf_encap_nlsize,
+	.cmp_encap	= bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 03976e939818..a5d4e866ce88 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "ILA";
 	case LWTUNNEL_ENCAP_SEG6:
 		return "SEG6";
+	case LWTUNNEL_ENCAP_BPF:
+		return "BPF";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
-- 
cgit v1.2.3


From 366cbf2f46048d70005c6c33dc289330f24b54b0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Nov 2016 22:16:06 +0100
Subject: bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller

After 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
the rcu_read_lock() in bpf_prog_run_xdp() is superfluous, since callers
need to hold rcu_read_lock() already to make sure BPF program doesn't
get released in the background.

Thus, drop it from bpf_prog_run_xdp(), as it can otherwise be misleading.
Keeping bpf_prog_run_xdp() is still useful as it allows for grepping
in XDP supported drivers and to keep the typecheck on the context intact.
For mlx4, this means we don't have a double rcu_read_lock() anymore. nfp can
just make use of bpf_prog_run_xdp(), too. For qede, just move rcu_read_lock()
out of the helper. When the driver gets atomic replace support, this will
move to call-sites eventually.

mlx5 needs actual fixing as it has the same issue as described already in
326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
that is, we're under RCU bh at this time, BPF programs are released via
call_rcu(), and call_rcu() != call_rcu_bh(), so we need to properly mark
read side as programs can get xchg()'ed in mlx5e_xdp_set() without queue
reset.

Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c     |  8 ++++++--
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c |  2 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c        |  7 +++++++
 include/linux/filter.h                              | 18 +++++++++---------
 4 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index b036710ba52c..42cd687e6608 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -737,10 +737,10 @@ static inline
 struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 			     u16 wqe_counter, u32 cqe_bcnt)
 {
-	struct bpf_prog *xdp_prog = READ_ONCE(rq->xdp_prog);
 	struct mlx5e_dma_info *di;
 	struct sk_buff *skb;
 	void *va, *data;
+	bool consumed;
 
 	di             = &rq->dma_info[wqe_counter];
 	va             = page_address(di->page);
@@ -759,7 +759,11 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 		return NULL;
 	}
 
-	if (mlx5e_xdp_handle(rq, xdp_prog, di, data, cqe_bcnt))
+	rcu_read_lock();
+	consumed = mlx5e_xdp_handle(rq, READ_ONCE(rq->xdp_prog), di, data,
+				    cqe_bcnt);
+	rcu_read_unlock();
+	if (consumed)
 		return NULL; /* page/packet was consumed by XDP */
 
 	skb = build_skb(va, RQ_PAGE_SIZE(rq));
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 876ab3a92ad5..00d9a03be31d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1518,7 +1518,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len)
 	xdp.data = data;
 	xdp.data_end = data + len;
 
-	return BPF_PROG_RUN(prog, &xdp);
+	return bpf_prog_run_xdp(prog, &xdp);
 }
 
 /**
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 172ff6da92ad..faeaa9f3b197 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1497,7 +1497,14 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data = page_address(bd->data) + cqe->placement_offset;
 	xdp.data_end = xdp.data + len;
+
+	/* Queues always have a full reset currently, so for the time
+	 * being until there's atomic program replace just mark read
+	 * side for map helpers.
+	 */
+	rcu_read_lock();
 	act = bpf_prog_run_xdp(prog, &xdp);
+	rcu_read_unlock();
 
 	if (act == XDP_PASS)
 		return true;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7ba644626553..97338134398f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -498,16 +498,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return BPF_PROG_RUN(prog, skb);
 }
 
-static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
-				   struct xdp_buff *xdp)
+static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
+					    struct xdp_buff *xdp)
 {
-	u32 ret;
-
-	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, xdp);
-	rcu_read_unlock();
-
-	return ret;
+	/* Caller needs to hold rcu_read_lock() (!), otherwise program
+	 * can be released while still running, or map elements could be
+	 * freed early while still having concurrent users. XDP fastpath
+	 * already takes rcu_read_lock() when fetching the program, so
+	 * it's not necessary here anymore.
+	 */
+	return BPF_PROG_RUN(prog, xdp);
 }
 
 static inline unsigned int bpf_prog_size(unsigned int proglen)
-- 
cgit v1.2.3


From fc831825f99eb3a2f1bf3fe7307b392513b642a5 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuval.mintz@cavium.com>
Date: Thu, 1 Dec 2016 00:21:06 -0800
Subject: qed: Add support for hardware offloaded iSCSI.

This adds the backbone required for the various HW initializations
which are necessary for the iSCSI driver (qedi) for QLogic FastLinQ
4xxxx line of adapters - FW notification, resource initializations, etc.

Signed-off-by: Arun Easi <arun.easi@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/Kconfig            |    3 +
 drivers/net/ethernet/qlogic/qed/Makefile       |    1 +
 drivers/net/ethernet/qlogic/qed/qed.h          |    7 +-
 drivers/net/ethernet/qlogic/qed/qed_dev.c      |   12 +
 drivers/net/ethernet/qlogic/qed/qed_iscsi.c    | 1277 ++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_iscsi.h    |   52 +
 drivers/net/ethernet/qlogic/qed/qed_ll2.c      |    4 +-
 drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |    2 +
 drivers/net/ethernet/qlogic/qed/qed_spq.c      |   15 +
 include/linux/qed/qed_if.h                     |    2 +
 include/linux/qed/qed_iscsi_if.h               |  229 +++++
 11 files changed, 1602 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.c
 create mode 100644 drivers/net/ethernet/qlogic/qed/qed_iscsi.h
 create mode 100644 include/linux/qed/qed_iscsi_if.h

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig
index 32f2a45f4ab2..3cfd10503446 100644
--- a/drivers/net/ethernet/qlogic/Kconfig
+++ b/drivers/net/ethernet/qlogic/Kconfig
@@ -110,4 +110,7 @@ config QEDE
 config QED_RDMA
 	bool
 
+config QED_ISCSI
+	bool
+
 endif # NET_VENDOR_QLOGIC
diff --git a/drivers/net/ethernet/qlogic/qed/Makefile b/drivers/net/ethernet/qlogic/qed/Makefile
index 967acf322c09..597e15c54a11 100644
--- a/drivers/net/ethernet/qlogic/qed/Makefile
+++ b/drivers/net/ethernet/qlogic/qed/Makefile
@@ -6,3 +6,4 @@ qed-y := qed_cxt.o qed_dev.o qed_hw.o qed_init_fw_funcs.o qed_init_ops.o \
 qed-$(CONFIG_QED_SRIOV) += qed_sriov.o qed_vf.o
 qed-$(CONFIG_QED_LL2) += qed_ll2.o
 qed-$(CONFIG_QED_RDMA) += qed_roce.o
+qed-$(CONFIG_QED_ISCSI) += qed_iscsi.o
diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 244dd40ccac3..1f423b38faab 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -35,6 +35,7 @@ extern const struct qed_common_ops qed_common_ops_pass;
 
 #define QED_WFQ_UNIT	100
 
+#define ISCSI_BDQ_ID(_port_id) (_port_id)
 #define QED_WID_SIZE            (1024)
 #define QED_PF_DEMS_SIZE        (4)
 
@@ -383,6 +384,7 @@ struct qed_hwfn {
 	bool				using_ll2;
 	struct qed_ll2_info		*p_ll2_info;
 	struct qed_rdma_info		*p_rdma_info;
+	struct qed_iscsi_info		*p_iscsi_info;
 	struct qed_pf_params		pf_params;
 
 	bool b_rdma_enabled_in_prs;
@@ -581,6 +583,8 @@ struct qed_dev {
 	/* Linux specific here */
 	struct  qede_dev		*edev;
 	struct  pci_dev			*pdev;
+	u32 flags;
+#define QED_FLAG_STORAGE_STARTED	(BIT(0))
 	int				msg_enable;
 
 	struct pci_params		pci_params;
@@ -594,6 +598,7 @@ struct qed_dev {
 	union {
 		struct qed_common_cb_ops	*common;
 		struct qed_eth_cb_ops		*eth;
+		struct qed_iscsi_cb_ops		*iscsi;
 	} protocol_ops;
 	void				*ops_cookie;
 
@@ -603,7 +608,7 @@ struct qed_dev {
 	struct qed_cb_ll2_info		*ll2;
 	u8				ll2_mac_address[ETH_ALEN];
 #endif
-
+	DECLARE_HASHTABLE(connections, 10);
 	const struct firmware		*firmware;
 
 	u32 rdma_max_sge;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 00b9a67ba359..cd9810296630 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -29,6 +29,7 @@
 #include "qed_hw.h"
 #include "qed_init_ops.h"
 #include "qed_int.h"
+#include "qed_iscsi.h"
 #include "qed_ll2.h"
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
@@ -146,6 +147,8 @@ void qed_resc_free(struct qed_dev *cdev)
 #ifdef CONFIG_QED_LL2
 		qed_ll2_free(p_hwfn, p_hwfn->p_ll2_info);
 #endif
+		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+			qed_iscsi_free(p_hwfn, p_hwfn->p_iscsi_info);
 		qed_iov_free(p_hwfn);
 		qed_dmae_info_free(p_hwfn);
 		qed_dcbx_info_free(p_hwfn, p_hwfn->p_dcbx_info);
@@ -402,6 +405,7 @@ int qed_qm_reconf(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 
 int qed_resc_alloc(struct qed_dev *cdev)
 {
+	struct qed_iscsi_info *p_iscsi_info;
 #ifdef CONFIG_QED_LL2
 	struct qed_ll2_info *p_ll2_info;
 #endif
@@ -507,6 +511,12 @@ int qed_resc_alloc(struct qed_dev *cdev)
 			p_hwfn->p_ll2_info = p_ll2_info;
 		}
 #endif
+		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) {
+			p_iscsi_info = qed_iscsi_alloc(p_hwfn);
+			if (!p_iscsi_info)
+				goto alloc_no_mem;
+			p_hwfn->p_iscsi_info = p_iscsi_info;
+		}
 
 		/* DMA info initialization */
 		rc = qed_dmae_info_alloc(p_hwfn);
@@ -560,6 +570,8 @@ void qed_resc_setup(struct qed_dev *cdev)
 		if (p_hwfn->using_ll2)
 			qed_ll2_setup(p_hwfn, p_hwfn->p_ll2_info);
 #endif
+		if (p_hwfn->hw_info.personality == QED_PCI_ISCSI)
+			qed_iscsi_setup(p_hwfn, p_hwfn->p_iscsi_info);
 	}
 }
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
new file mode 100644
index 000000000000..00efb1c4c57e
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c
@@ -0,0 +1,1277 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <asm/param.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/qed/qed_iscsi_if.h>
+#include "qed.h"
+#include "qed_cxt.h"
+#include "qed_dev_api.h"
+#include "qed_hsi.h"
+#include "qed_hw.h"
+#include "qed_int.h"
+#include "qed_iscsi.h"
+#include "qed_ll2.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+#include "qed_sriov.h"
+#include "qed_reg_addr.h"
+
+/* Driver-side description of one offloaded iSCSI connection.
+ *
+ * Values are stored here first and copied into the offload, update and
+ * termination ramrods when the corresponding slow-path request is built.
+ */
+struct qed_iscsi_conn {
+	/* Node used to park the connection on qed_iscsi_info's free_list */
+	struct list_head list_entry;
+	/* True for objects created by qed_iscsi_allocate_connection() */
+	bool free_on_delete;
+
+	/* Identifiers, all derived from the CID acquired for the connection */
+	u16 conn_id;
+	u32 icid;
+	u32 fw_cid;
+
+	/* Ramrod header layer code, offload flags and the queues whose PBL
+	 * addresses are handed over in the offload ramrod.
+	 */
+	u8 layer_code;
+	u8 offl_flags;
+	u8 connect_mode;
+	u32 initial_ack;
+	dma_addr_t sq_pbl_addr;
+	struct qed_chain r2tq;
+	struct qed_chain xhq;
+	struct qed_chain uhq;
+
+	/* DMA buffers whose bus addresses are passed in the termination
+	 * ramrod (query_params_addr / queue_cnts_addr).
+	 */
+	struct tcp_upload_params *tcp_upload_params_virt_addr;
+	dma_addr_t tcp_upload_params_phys_addr;
+	struct scsi_terminate_extra_params *queue_cnts_virt_addr;
+	dma_addr_t queue_cnts_phys_addr;
+	/* syn_phy_addr and syn_ip_payload_length are only copied when the
+	 * "option 2" TCP offload layout is used.
+	 */
+	dma_addr_t syn_phy_addr;
+
+	/* TCP/IP parameters copied into the TCP part of the offload ramrod */
+	u16 syn_ip_payload_length;
+	u8 local_mac[6];
+	u8 remote_mac[6];
+	u16 vlan_id;
+	u8 tcp_flags;
+	u8 ip_version;
+	u32 remote_ip[4];
+	u32 local_ip[4];
+	u8 ka_max_probe_cnt;
+	u8 dup_ack_theshold;
+	u32 rcv_next;
+	u32 snd_una;
+	u32 snd_next;
+	u32 snd_max;
+	u32 snd_wnd;
+	u32 rcv_wnd;
+	u32 snd_wl1;
+	u32 cwnd;
+	u32 ss_thresh;
+	u16 srtt;
+	u16 rtt_var;
+	u32 ts_time;
+	u32 ts_recent;
+	u32 ts_recent_age;
+	u32 total_rt;
+	u32 ka_timeout_delta;
+	u32 rt_timeout_delta;
+	u8 dup_ack_cnt;
+	u8 snd_wnd_probe_cnt;
+	u8 ka_probe_cnt;
+	u8 rt_cnt;
+	u32 flow_label;
+	u32 ka_timeout;
+	u32 ka_interval;
+	u32 max_rt_time;
+	u32 initial_rcv_wnd;
+	u8 ttl;
+	u8 tos_or_tc;
+	u16 remote_port;
+	u16 local_port;
+	u16 mss;
+	u8 snd_wnd_scale;
+	u8 rcv_wnd_scale;
+	u32 ts_ticks_per_second;
+	u16 da_timeout_value;
+	u8 ack_frequency;
+
+	/* iSCSI-level parameters: default_cq and stat_sn are sent with the
+	 * offload ramrod, the remaining fields with the update ramrod.
+	 */
+	u8 update_flag;
+	u8 default_cq;
+	u32 max_seq_size;
+	u32 max_recv_pdu_length;
+	u32 max_send_pdu_length;
+	u32 first_seq_length;
+	u32 exp_stat_sn;
+	u32 stat_sn;
+	/* Physical queue ids recorded while the offload ramrod is built */
+	u16 physical_q0;
+	u16 physical_q1;
+	/* Copied to the 'abortive' field of the termination ramrod */
+	u8 abortive_dsconnect;
+};
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_INIT_FUNC ramrod.
+ *
+ * The ramrod is populated from p_hwfn->pf_params.iscsi_pf_params. The async
+ * event callback and its context are recorded in p_hwfn->p_iscsi_info right
+ * before the request is posted.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int
+qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn,
+			enum spq_mode comp_mode,
+			struct qed_spq_comp_cb *p_comp_addr,
+			void *event_context, iscsi_event_cb_t async_event_cb)
+{
+	struct qed_iscsi_pf_params *p_pf = &p_hwfn->pf_params.iscsi_pf_params;
+	struct iscsi_init_ramrod_params *p_init_ramrod;
+	struct iscsi_spe_func_init *p_func;
+	struct scsi_init_func_queues *p_q;
+	struct qed_spq_entry *p_spq_ent = NULL;
+	struct qed_sp_init_data sp_data;
+	u8 q_idx;
+	int rc;
+
+	/* Describe the slow-path request and obtain an SPQ entry for it */
+	memset(&sp_data, 0, sizeof(sp_data));
+	sp_data.cid = qed_spq_get_cid(p_hwfn);
+	sp_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	sp_data.comp_mode = comp_mode;
+	sp_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_spq_ent,
+				 ISCSI_RAMROD_CMD_ID_INIT_FUNC,
+				 PROTOCOLID_ISCSI, &sp_data);
+	if (rc)
+		return rc;
+
+	p_init_ramrod = &p_spq_ent->ramrod.iscsi_init;
+	p_func = &p_init_ramrod->iscsi_init_spe;
+	p_q = &p_func->q_params;
+
+	/* Slow-path header */
+	SET_FIELD(p_func->hdr.flags,
+		  ISCSI_SLOW_PATH_HDR_LAYER_CODE, ISCSI_SLOW_PATH_LAYER_CODE);
+	p_func->hdr.op_code = ISCSI_RAMROD_CMD_ID_INIT_FUNC;
+
+	/* Function-wide parameters */
+	p_func->half_way_close_timeout =
+	    cpu_to_le16(p_pf->half_way_close_timeout);
+	p_func->num_sq_pages_in_ring = p_pf->num_sq_pages_in_ring;
+	p_func->num_r2tq_pages_in_ring = p_pf->num_r2tq_pages_in_ring;
+	p_func->num_uhq_pages_in_ring = p_pf->num_uhq_pages_in_ring;
+	p_func->func_params.log_page_size = p_pf->log_page_size;
+	p_func->func_params.num_tasks = cpu_to_le16(p_pf->num_tasks);
+	p_func->debug_mode.flags = p_pf->debug_mode;
+
+	/* Global CQ / CMDQ description */
+	DMA_REGPAIR_LE(p_q->glbl_q_params_addr,
+		       p_pf->glbl_q_params_addr);
+
+	p_q->cq_num_entries = cpu_to_le16(p_pf->cq_num_entries);
+	p_q->cmdq_num_entries = cpu_to_le16(p_pf->cmdq_num_entries);
+	p_q->num_queues = p_pf->num_queues;
+	p_q->queue_relative_offset =
+	    (u8)p_hwfn->hw_info.resc_start[QED_CMDQS_CQS];
+	p_q->cq_sb_pi = p_pf->gl_rq_pi;
+	p_q->cmdq_sb_pi = p_pf->gl_cmd_pi;
+
+	/* One status block id per queue */
+	for (q_idx = 0; q_idx < p_pf->num_queues; q_idx++)
+		p_q->cq_cmdq_sb_num_arr[q_idx] =
+		    cpu_to_le16(p_hwfn->sbs_info[q_idx]->igu_sb_id);
+
+	p_q->bdq_resource_id = ISCSI_BDQ_ID(p_hwfn->port_id);
+
+	/* RQ buffer descriptor queue */
+	DMA_REGPAIR_LE(p_q->bdq_pbl_base_address[BDQ_ID_RQ],
+		       p_pf->bdq_pbl_base_addr[BDQ_ID_RQ]);
+	p_q->bdq_pbl_num_entries[BDQ_ID_RQ] =
+	    p_pf->bdq_pbl_num_entries[BDQ_ID_RQ];
+	p_q->bdq_xoff_threshold[BDQ_ID_RQ] =
+	    cpu_to_le16(p_pf->bdq_xoff_threshold[BDQ_ID_RQ]);
+	p_q->bdq_xon_threshold[BDQ_ID_RQ] =
+	    cpu_to_le16(p_pf->bdq_xon_threshold[BDQ_ID_RQ]);
+
+	/* Immediate-data buffer descriptor queue */
+	DMA_REGPAIR_LE(p_q->bdq_pbl_base_address[BDQ_ID_IMM_DATA],
+		       p_pf->bdq_pbl_base_addr[BDQ_ID_IMM_DATA]);
+	p_q->bdq_pbl_num_entries[BDQ_ID_IMM_DATA] =
+	    p_pf->bdq_pbl_num_entries[BDQ_ID_IMM_DATA];
+	p_q->bdq_xoff_threshold[BDQ_ID_IMM_DATA] =
+	    cpu_to_le16(p_pf->bdq_xoff_threshold[BDQ_ID_IMM_DATA]);
+	p_q->bdq_xon_threshold[BDQ_ID_IMM_DATA] =
+	    cpu_to_le16(p_pf->bdq_xon_threshold[BDQ_ID_IMM_DATA]);
+	p_q->rq_buffer_size = cpu_to_le16(p_pf->rq_buffer_size);
+
+	/* The RQ is always marked valid; the immediate-data queue (when it
+	 * has PBL entries) and the command queue only in target mode.
+	 */
+	SET_FIELD(p_q->q_validity, SCSI_INIT_FUNC_QUEUES_RQ_VALID, 1);
+	if (p_pf->is_target) {
+		if (p_q->bdq_pbl_num_entries[BDQ_ID_IMM_DATA])
+			SET_FIELD(p_q->q_validity,
+				  SCSI_INIT_FUNC_QUEUES_IMM_DATA_VALID, 1);
+		SET_FIELD(p_q->q_validity,
+			  SCSI_INIT_FUNC_QUEUES_CMD_VALID, 1);
+	}
+
+	/* TCP timers */
+	p_init_ramrod->tcp_init.two_msl_timer =
+	    cpu_to_le32(p_pf->two_msl_timer);
+	p_init_ramrod->tcp_init.tx_sws_timer = cpu_to_le16(p_pf->tx_sws_timer);
+	p_init_ramrod->tcp_init.maxfinrt = p_pf->max_fin_rt;
+
+	/* Remember where asynchronous events have to be delivered */
+	p_hwfn->p_iscsi_info->event_context = event_context;
+	p_hwfn->p_iscsi_info->event_cb = async_event_cb;
+
+	return qed_spq_post(p_hwfn, p_spq_ent, NULL);
+}
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN ramrod for @p_conn.
+ *
+ * The iSCSI part of the ramrod (queues, flags, StatSN) is always filled.
+ * The TCP part is written in one of two layouts, selected by the
+ * ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B bit of the offload flags.
+ *
+ * As a side effect the two physical queue ids obtained from
+ * qed_get_qm_pq() are also stored in @p_conn.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int qed_sp_iscsi_conn_offload(struct qed_hwfn *p_hwfn,
+				     struct qed_iscsi_conn *p_conn,
+				     enum spq_mode comp_mode,
+				     struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct iscsi_spe_conn_offload *p_ramrod = NULL;
+	struct tcp_offload_params_opt2 *p_tcp2 = NULL;
+	struct tcp_offload_params *p_tcp = NULL;
+	struct qed_spq_entry *p_ent = NULL;
+	struct qed_sp_init_data init_data;
+	union qed_qm_pq_params pq_params;
+	u16 pq0_id = 0, pq1_id = 0;
+	dma_addr_t r2tq_pbl_addr;
+	dma_addr_t xhq_pbl_addr;
+	dma_addr_t uhq_pbl_addr;
+	int rc = 0;
+	u32 dval;
+	u16 wval;
+	u8 i;
+	u16 *p;
+
+	/* Get SPQ entry */
+	memset(&init_data, 0, sizeof(init_data));
+	init_data.cid = p_conn->icid;
+	init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	init_data.comp_mode = comp_mode;
+	init_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_ent,
+				 ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN,
+				 PROTOCOLID_ISCSI, &init_data);
+	if (rc)
+		return rc;
+
+	p_ramrod = &p_ent->ramrod.iscsi_conn_offload;
+
+	/* Transmission PQ is the first of the PF */
+	memset(&pq_params, 0, sizeof(pq_params));
+	pq0_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_ISCSI, &pq_params);
+	/* NOTE(review): physical_q0/q1 are declared as plain u16 in
+	 * struct qed_iscsi_conn but receive a cpu_to_le16() value here -
+	 * confirm how they are consumed on big-endian hosts.
+	 */
+	p_conn->physical_q0 = cpu_to_le16(pq0_id);
+	p_ramrod->iscsi.physical_q0 = cpu_to_le16(pq0_id);
+
+	/* iSCSI Pure-ACK PQ */
+	pq_params.iscsi.q_idx = 1;
+	pq1_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_ISCSI, &pq_params);
+	p_conn->physical_q1 = cpu_to_le16(pq1_id);
+	p_ramrod->iscsi.physical_q1 = cpu_to_le16(pq1_id);
+
+	p_ramrod->hdr.op_code = ISCSI_RAMROD_CMD_ID_OFFLOAD_CONN;
+	SET_FIELD(p_ramrod->hdr.flags, ISCSI_SLOW_PATH_HDR_LAYER_CODE,
+		  p_conn->layer_code);
+
+	/* Note that fw_cid in the ramrod is filled from icid, not from
+	 * p_conn->fw_cid.
+	 */
+	p_ramrod->conn_id = cpu_to_le16(p_conn->conn_id);
+	p_ramrod->fw_cid = cpu_to_le32(p_conn->icid);
+
+	/* PBL addresses of the SQ and of the three per-connection chains */
+	DMA_REGPAIR_LE(p_ramrod->iscsi.sq_pbl_addr, p_conn->sq_pbl_addr);
+
+	r2tq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->r2tq);
+	DMA_REGPAIR_LE(p_ramrod->iscsi.r2tq_pbl_addr, r2tq_pbl_addr);
+
+	xhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->xhq);
+	DMA_REGPAIR_LE(p_ramrod->iscsi.xhq_pbl_addr, xhq_pbl_addr);
+
+	uhq_pbl_addr = qed_chain_get_pbl_phys(&p_conn->uhq);
+	DMA_REGPAIR_LE(p_ramrod->iscsi.uhq_pbl_addr, uhq_pbl_addr);
+
+	p_ramrod->iscsi.initial_ack = cpu_to_le32(p_conn->initial_ack);
+	p_ramrod->iscsi.flags = p_conn->offl_flags;
+	p_ramrod->iscsi.default_cq = p_conn->default_cq;
+	p_ramrod->iscsi.stat_sn = cpu_to_le32(p_conn->stat_sn);
+
+	if (!GET_FIELD(p_ramrod->iscsi.flags,
+		       ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B)) {
+		/* Flag clear: full TCP state is copied into the ramrod */
+		p_tcp = &p_ramrod->tcp;
+
+		/* Each MAC is read as three 16-bit words (possibly
+		 * unaligned) and every word is byte-swapped before it is
+		 * stored in the hi/mid/lo fields.
+		 */
+		p = (u16 *)p_conn->local_mac;
+		p_tcp->local_mac_addr_hi = swab16(get_unaligned(p));
+		p_tcp->local_mac_addr_mid = swab16(get_unaligned(p + 1));
+		p_tcp->local_mac_addr_lo = swab16(get_unaligned(p + 2));
+
+		p = (u16 *)p_conn->remote_mac;
+		p_tcp->remote_mac_addr_hi = swab16(get_unaligned(p));
+		p_tcp->remote_mac_addr_mid = swab16(get_unaligned(p + 1));
+		p_tcp->remote_mac_addr_lo = swab16(get_unaligned(p + 2));
+
+		p_tcp->vlan_id = cpu_to_le16(p_conn->vlan_id);
+
+		p_tcp->flags = p_conn->tcp_flags;
+		p_tcp->ip_version = p_conn->ip_version;
+		/* All four 32-bit words are copied regardless of ip_version */
+		for (i = 0; i < 4; i++) {
+			dval = p_conn->remote_ip[i];
+			p_tcp->remote_ip[i] = cpu_to_le32(dval);
+			dval = p_conn->local_ip[i];
+			p_tcp->local_ip[i] = cpu_to_le32(dval);
+		}
+		p_tcp->ka_max_probe_cnt = p_conn->ka_max_probe_cnt;
+		p_tcp->dup_ack_theshold = p_conn->dup_ack_theshold;
+
+		p_tcp->rcv_next = cpu_to_le32(p_conn->rcv_next);
+		p_tcp->snd_una = cpu_to_le32(p_conn->snd_una);
+		p_tcp->snd_next = cpu_to_le32(p_conn->snd_next);
+		p_tcp->snd_max = cpu_to_le32(p_conn->snd_max);
+		p_tcp->snd_wnd = cpu_to_le32(p_conn->snd_wnd);
+		p_tcp->rcv_wnd = cpu_to_le32(p_conn->rcv_wnd);
+		p_tcp->snd_wl1 = cpu_to_le32(p_conn->snd_wl1);
+		p_tcp->cwnd = cpu_to_le32(p_conn->cwnd);
+		p_tcp->ss_thresh = cpu_to_le32(p_conn->ss_thresh);
+		p_tcp->srtt = cpu_to_le16(p_conn->srtt);
+		p_tcp->rtt_var = cpu_to_le16(p_conn->rtt_var);
+		p_tcp->ts_time = cpu_to_le32(p_conn->ts_time);
+		p_tcp->ts_recent = cpu_to_le32(p_conn->ts_recent);
+		p_tcp->ts_recent_age = cpu_to_le32(p_conn->ts_recent_age);
+		p_tcp->total_rt = cpu_to_le32(p_conn->total_rt);
+		dval = p_conn->ka_timeout_delta;
+		p_tcp->ka_timeout_delta = cpu_to_le32(dval);
+		dval = p_conn->rt_timeout_delta;
+		p_tcp->rt_timeout_delta = cpu_to_le32(dval);
+		p_tcp->dup_ack_cnt = p_conn->dup_ack_cnt;
+		p_tcp->snd_wnd_probe_cnt = p_conn->snd_wnd_probe_cnt;
+		p_tcp->ka_probe_cnt = p_conn->ka_probe_cnt;
+		p_tcp->rt_cnt = p_conn->rt_cnt;
+		p_tcp->flow_label = cpu_to_le32(p_conn->flow_label);
+		p_tcp->ka_timeout = cpu_to_le32(p_conn->ka_timeout);
+		p_tcp->ka_interval = cpu_to_le32(p_conn->ka_interval);
+		p_tcp->max_rt_time = cpu_to_le32(p_conn->max_rt_time);
+		dval = p_conn->initial_rcv_wnd;
+		p_tcp->initial_rcv_wnd = cpu_to_le32(dval);
+		p_tcp->ttl = p_conn->ttl;
+		p_tcp->tos_or_tc = p_conn->tos_or_tc;
+		p_tcp->remote_port = cpu_to_le16(p_conn->remote_port);
+		p_tcp->local_port = cpu_to_le16(p_conn->local_port);
+		p_tcp->mss = cpu_to_le16(p_conn->mss);
+		p_tcp->snd_wnd_scale = p_conn->snd_wnd_scale;
+		p_tcp->rcv_wnd_scale = p_conn->rcv_wnd_scale;
+		dval = p_conn->ts_ticks_per_second;
+		p_tcp->ts_ticks_per_second = cpu_to_le32(dval);
+		wval = p_conn->da_timeout_value;
+		p_tcp->da_timeout_value = cpu_to_le16(wval);
+		p_tcp->ack_frequency = p_conn->ack_frequency;
+		p_tcp->connect_mode = p_conn->connect_mode;
+	} else {
+		/* Flag set: the same ramrod memory is reinterpreted as the
+		 * "option 2" layout, which carries addressing information
+		 * plus the SYN buffer instead of the full TCP state.
+		 */
+		p_tcp2 =
+		    &((struct iscsi_spe_conn_offload_option2 *)p_ramrod)->tcp;
+
+		p = (u16 *)p_conn->local_mac;
+		p_tcp2->local_mac_addr_hi = swab16(get_unaligned(p));
+		p_tcp2->local_mac_addr_mid = swab16(get_unaligned(p + 1));
+		p_tcp2->local_mac_addr_lo = swab16(get_unaligned(p + 2));
+
+		p = (u16 *)p_conn->remote_mac;
+		p_tcp2->remote_mac_addr_hi = swab16(get_unaligned(p));
+		p_tcp2->remote_mac_addr_mid = swab16(get_unaligned(p + 1));
+		p_tcp2->remote_mac_addr_lo = swab16(get_unaligned(p + 2));
+
+		p_tcp2->vlan_id = cpu_to_le16(p_conn->vlan_id);
+		p_tcp2->flags = p_conn->tcp_flags;
+
+		p_tcp2->ip_version = p_conn->ip_version;
+		for (i = 0; i < 4; i++) {
+			dval = p_conn->remote_ip[i];
+			p_tcp2->remote_ip[i] = cpu_to_le32(dval);
+			dval = p_conn->local_ip[i];
+			p_tcp2->local_ip[i] = cpu_to_le32(dval);
+		}
+
+		p_tcp2->flow_label = cpu_to_le32(p_conn->flow_label);
+		p_tcp2->ttl = p_conn->ttl;
+		p_tcp2->tos_or_tc = p_conn->tos_or_tc;
+		p_tcp2->remote_port = cpu_to_le16(p_conn->remote_port);
+		p_tcp2->local_port = cpu_to_le16(p_conn->local_port);
+		p_tcp2->mss = cpu_to_le16(p_conn->mss);
+		p_tcp2->rcv_wnd_scale = p_conn->rcv_wnd_scale;
+		p_tcp2->connect_mode = p_conn->connect_mode;
+		wval = p_conn->syn_ip_payload_length;
+		p_tcp2->syn_ip_payload_length = cpu_to_le16(wval);
+		/* Bus address of the SYN buffer, split into low/high halves */
+		p_tcp2->syn_phy_addr_lo = DMA_LO_LE(p_conn->syn_phy_addr);
+		p_tcp2->syn_phy_addr_hi = DMA_HI_LE(p_conn->syn_phy_addr);
+	}
+
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_UPDATE_CONN ramrod for @p_conn.
+ *
+ * The ramrod carries the connection's update flags, its sequence and PDU
+ * length limits and exp_stat_sn.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int qed_sp_iscsi_conn_update(struct qed_hwfn *p_hwfn,
+				    struct qed_iscsi_conn *p_conn,
+				    enum spq_mode comp_mode,
+				    struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct iscsi_conn_update_ramrod_params *p_update;
+	struct qed_spq_entry *p_spq_ent = NULL;
+	struct qed_sp_init_data sp_data;
+	int rc;
+
+	/* Describe the slow-path request and obtain an SPQ entry for it */
+	memset(&sp_data, 0, sizeof(sp_data));
+	sp_data.cid = p_conn->icid;
+	sp_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	sp_data.comp_mode = comp_mode;
+	sp_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_spq_ent,
+				 ISCSI_RAMROD_CMD_ID_UPDATE_CONN,
+				 PROTOCOLID_ISCSI, &sp_data);
+	if (rc)
+		return rc;
+
+	p_update = &p_spq_ent->ramrod.iscsi_conn_update;
+
+	/* Slow-path header and connection identifiers */
+	p_update->hdr.op_code = ISCSI_RAMROD_CMD_ID_UPDATE_CONN;
+	SET_FIELD(p_update->hdr.flags,
+		  ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code);
+	p_update->conn_id = cpu_to_le16(p_conn->conn_id);
+	p_update->fw_cid = cpu_to_le32(p_conn->icid);
+
+	/* Parameters being updated */
+	p_update->flags = p_conn->update_flag;
+	p_update->max_seq_size = cpu_to_le32(p_conn->max_seq_size);
+	p_update->max_recv_pdu_length =
+	    cpu_to_le32(p_conn->max_recv_pdu_length);
+	p_update->max_send_pdu_length =
+	    cpu_to_le32(p_conn->max_send_pdu_length);
+	p_update->first_seq_length = cpu_to_le32(p_conn->first_seq_length);
+	p_update->exp_stat_sn = cpu_to_le32(p_conn->exp_stat_sn);
+
+	return qed_spq_post(p_hwfn, p_spq_ent, NULL);
+}
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_TERMINATION_CONN ramrod.
+ *
+ * Besides the connection identifiers and the abortive flag, the ramrod is
+ * given the bus addresses of the connection's tcp_upload_params and
+ * queue-counters DMA buffers.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int qed_sp_iscsi_conn_terminate(struct qed_hwfn *p_hwfn,
+				       struct qed_iscsi_conn *p_conn,
+				       enum spq_mode comp_mode,
+				       struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct iscsi_spe_conn_termination *p_term;
+	struct qed_spq_entry *p_spq_ent = NULL;
+	struct qed_sp_init_data sp_data;
+	int rc;
+
+	/* Describe the slow-path request and obtain an SPQ entry for it */
+	memset(&sp_data, 0, sizeof(sp_data));
+	sp_data.cid = p_conn->icid;
+	sp_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	sp_data.comp_mode = comp_mode;
+	sp_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_spq_ent,
+				 ISCSI_RAMROD_CMD_ID_TERMINATION_CONN,
+				 PROTOCOLID_ISCSI, &sp_data);
+	if (rc)
+		return rc;
+
+	p_term = &p_spq_ent->ramrod.iscsi_conn_terminate;
+
+	/* Slow-path header and connection identifiers */
+	p_term->hdr.op_code = ISCSI_RAMROD_CMD_ID_TERMINATION_CONN;
+	SET_FIELD(p_term->hdr.flags,
+		  ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code);
+	p_term->conn_id = cpu_to_le16(p_conn->conn_id);
+	p_term->fw_cid = cpu_to_le32(p_conn->icid);
+	p_term->abortive = p_conn->abortive_dsconnect;
+
+	/* DMA buffers referenced by the termination request */
+	DMA_REGPAIR_LE(p_term->query_params_addr,
+		       p_conn->tcp_upload_params_phys_addr);
+	DMA_REGPAIR_LE(p_term->queue_cnts_addr, p_conn->queue_cnts_phys_addr);
+
+	return qed_spq_post(p_hwfn, p_spq_ent, NULL);
+}
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_CLEAR_SQ ramrod for @p_conn.
+ *
+ * The ramrod consists of the slow-path header only.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int qed_sp_iscsi_conn_clear_sq(struct qed_hwfn *p_hwfn,
+				      struct qed_iscsi_conn *p_conn,
+				      enum spq_mode comp_mode,
+				      struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct qed_spq_entry *p_spq_ent = NULL;
+	struct qed_sp_init_data sp_data;
+	struct iscsi_slow_path_hdr *p_hdr;
+	int rc;
+
+	/* Describe the slow-path request and obtain an SPQ entry for it */
+	memset(&sp_data, 0, sizeof(sp_data));
+	sp_data.cid = p_conn->icid;
+	sp_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	sp_data.comp_mode = comp_mode;
+	sp_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_spq_ent,
+				 ISCSI_RAMROD_CMD_ID_CLEAR_SQ,
+				 PROTOCOLID_ISCSI, &sp_data);
+	if (rc)
+		return rc;
+
+	p_hdr = &p_spq_ent->ramrod.iscsi_empty;
+	p_hdr->op_code = ISCSI_RAMROD_CMD_ID_CLEAR_SQ;
+	SET_FIELD(p_hdr->flags,
+		  ISCSI_SLOW_PATH_HDR_LAYER_CODE, p_conn->layer_code);
+
+	return qed_spq_post(p_hwfn, p_spq_ent, NULL);
+}
+
+/* Build and post the ISCSI_RAMROD_CMD_ID_DESTROY_FUNC ramrod.
+ *
+ * Only the op-code of the ramrod header is filled in.
+ *
+ * Returns the qed_sp_init_request() error if no SPQ entry could be prepared,
+ * otherwise the result of qed_spq_post().
+ */
+static int qed_sp_iscsi_func_stop(struct qed_hwfn *p_hwfn,
+				  enum spq_mode comp_mode,
+				  struct qed_spq_comp_cb *p_comp_addr)
+{
+	struct iscsi_spe_func_dstry *p_destroy;
+	struct qed_spq_entry *p_spq_ent = NULL;
+	struct qed_sp_init_data sp_data;
+	int rc;
+
+	/* Describe the slow-path request and obtain an SPQ entry for it */
+	memset(&sp_data, 0, sizeof(sp_data));
+	sp_data.cid = qed_spq_get_cid(p_hwfn);
+	sp_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
+	sp_data.comp_mode = comp_mode;
+	sp_data.p_comp_data = p_comp_addr;
+
+	rc = qed_sp_init_request(p_hwfn, &p_spq_ent,
+				 ISCSI_RAMROD_CMD_ID_DESTROY_FUNC,
+				 PROTOCOLID_ISCSI, &sp_data);
+	if (rc)
+		return rc;
+
+	p_destroy = &p_spq_ent->ramrod.iscsi_destroy;
+	p_destroy->hdr.op_code = ISCSI_RAMROD_CMD_ID_DESTROY_FUNC;
+
+	return qed_spq_post(p_hwfn, p_spq_ent, NULL);
+}
+
+/* Return the address inside the doorbell BAR mapping that corresponds to
+ * @cid, using the DQ_DEMS_LEGACY doorbell layout.
+ */
+static void __iomem *qed_iscsi_get_db_addr(struct qed_hwfn *p_hwfn, u32 cid)
+{
+	u8 __iomem *p_db_base = (u8 __iomem *)p_hwfn->doorbells;
+
+	return p_db_base + qed_db_addr(cid, DQ_DEMS_LEGACY);
+}
+
+/* Return the address of the Mstorm BDQ external producer for @bdq_id.
+ *
+ * The address lies in the MSDM RAM window of the register view; the BDQ
+ * function id is derived from the hwfn's port id.
+ */
+static void __iomem *qed_iscsi_get_primary_bdq_prod(struct qed_hwfn *p_hwfn,
+						    u8 bdq_id)
+{
+	u8 __iomem *p_msdm_ram = (u8 __iomem *)p_hwfn->regview +
+				 GTT_BAR0_MAP_REG_MSDM_RAM;
+	u8 bdq_func = ISCSI_BDQ_ID(p_hwfn->port_id);
+
+	return p_msdm_ram + MSTORM_SCSI_BDQ_EXT_PROD_OFFSET(bdq_func, bdq_id);
+}
+
+/* Return the address of the Tstorm BDQ external producer for @bdq_id.
+ *
+ * The address lies in the TSDM RAM window of the register view; the BDQ
+ * function id is derived from the hwfn's port id.
+ */
+static void __iomem *qed_iscsi_get_secondary_bdq_prod(struct qed_hwfn *p_hwfn,
+						      u8 bdq_id)
+{
+	u8 __iomem *p_tsdm_ram = (u8 __iomem *)p_hwfn->regview +
+				 GTT_BAR0_MAP_REG_TSDM_RAM;
+	u8 bdq_func = ISCSI_BDQ_ID(p_hwfn->port_id);
+
+	return p_tsdm_ram + TSTORM_SCSI_BDQ_EXT_PROD_OFFSET(bdq_func, bdq_id);
+}
+
+/* Reset the per-connection buffers before the connection is (re)used.
+ *
+ * The queue-counters buffer, the TCP upload-params buffer and the r2tq, uhq
+ * and xhq chains are zeroed in that order. The first one found to be missing
+ * stops the sequence; resources handled before it stay zeroed.
+ *
+ * Returns 0 on success, -ENOMEM if any of the resources is not allocated.
+ */
+static int qed_iscsi_setup_connection(struct qed_hwfn *p_hwfn,
+				      struct qed_iscsi_conn *p_conn)
+{
+	struct scsi_terminate_extra_params *p_q_cnts;
+	struct tcp_upload_params *p_upload;
+
+	p_q_cnts = p_conn->queue_cnts_virt_addr;
+	if (!p_q_cnts)
+		return -ENOMEM;
+	memset(p_q_cnts, 0, sizeof(*p_q_cnts));
+
+	p_upload = p_conn->tcp_upload_params_virt_addr;
+	if (!p_upload)
+		return -ENOMEM;
+	memset(p_upload, 0, sizeof(*p_upload));
+
+	if (!p_conn->r2tq.p_virt_addr)
+		return -ENOMEM;
+	qed_chain_pbl_zero_mem(&p_conn->r2tq);
+
+	if (!p_conn->uhq.p_virt_addr)
+		return -ENOMEM;
+	qed_chain_pbl_zero_mem(&p_conn->uhq);
+
+	if (!p_conn->xhq.p_virt_addr)
+		return -ENOMEM;
+	qed_chain_pbl_zero_mem(&p_conn->xhq);
+
+	return 0;
+}
+
+/* Hand out a connection object through @p_out_conn.
+ *
+ * A connection parked on the free list is reused when available. Otherwise
+ * a new one is created together with its two DMA buffers (queue counters and
+ * TCP upload params) and its r2tq, uhq and xhq PBL chains, which are sized
+ * from the PF's iSCSI parameters.
+ *
+ * Returns 0 on success and -ENOMEM if any allocation fails; in the latter
+ * case everything allocated so far is released and @p_out_conn is untouched.
+ */
+static int qed_iscsi_allocate_connection(struct qed_hwfn *p_hwfn,
+					 struct qed_iscsi_conn **p_out_conn)
+{
+	struct qed_iscsi_pf_params *p_pf = &p_hwfn->pf_params.iscsi_pf_params;
+	struct qed_iscsi_info *p_info = p_hwfn->p_iscsi_info;
+	struct qed_iscsi_conn *p_new = NULL;
+	u16 r2tq_elems, uhq_elems;
+
+	/* Reuse a parked connection when one is available */
+	spin_lock_bh(&p_info->lock);
+	if (!list_empty(&p_info->free_list)) {
+		p_new = list_first_entry(&p_info->free_list,
+					 struct qed_iscsi_conn, list_entry);
+		list_del(&p_new->list_entry);
+	}
+	spin_unlock_bh(&p_info->lock);
+
+	if (p_new) {
+		*p_out_conn = p_new;
+		return 0;
+	}
+
+	/* Nothing to reuse - build a connection from scratch */
+	p_new = kzalloc(sizeof(*p_new), GFP_KERNEL);
+	if (!p_new)
+		return -ENOMEM;
+
+	p_new->queue_cnts_virt_addr =
+	    dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+			       sizeof(*p_new->queue_cnts_virt_addr),
+			       &p_new->queue_cnts_phys_addr, GFP_KERNEL);
+	if (!p_new->queue_cnts_virt_addr)
+		goto err_free_conn;
+
+	p_new->tcp_upload_params_virt_addr =
+	    dma_alloc_coherent(&p_hwfn->cdev->pdev->dev,
+			       sizeof(*p_new->tcp_upload_params_virt_addr),
+			       &p_new->tcp_upload_params_phys_addr, GFP_KERNEL);
+	if (!p_new->tcp_upload_params_virt_addr)
+		goto err_free_q_cnts;
+
+	/* R2TQ: elements are 0x80 bytes each */
+	r2tq_elems = p_pf->num_r2tq_pages_in_ring *
+		     QED_CHAIN_PAGE_SIZE / 0x80;
+	if (qed_chain_alloc(p_hwfn->cdev,
+			    QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+			    QED_CHAIN_MODE_PBL,
+			    QED_CHAIN_CNT_TYPE_U16,
+			    r2tq_elems, 0x80, &p_new->r2tq))
+		goto err_free_upload;
+
+	uhq_elems = p_pf->num_uhq_pages_in_ring *
+		    QED_CHAIN_PAGE_SIZE / sizeof(struct iscsi_uhqe);
+	if (qed_chain_alloc(p_hwfn->cdev,
+			    QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+			    QED_CHAIN_MODE_PBL,
+			    QED_CHAIN_CNT_TYPE_U16,
+			    uhq_elems,
+			    sizeof(struct iscsi_uhqe), &p_new->uhq))
+		goto err_free_r2tq;
+
+	/* The XHQ holds as many elements as the UHQ */
+	if (qed_chain_alloc(p_hwfn->cdev,
+			    QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+			    QED_CHAIN_MODE_PBL,
+			    QED_CHAIN_CNT_TYPE_U16,
+			    uhq_elems,
+			    sizeof(struct iscsi_xhqe), &p_new->xhq))
+		goto err_free_uhq;
+
+	p_new->free_on_delete = true;
+	*p_out_conn = p_new;
+	return 0;
+
+	/* Unwind in reverse order of allocation */
+err_free_uhq:
+	qed_chain_free(p_hwfn->cdev, &p_new->uhq);
+err_free_r2tq:
+	qed_chain_free(p_hwfn->cdev, &p_new->r2tq);
+err_free_upload:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  sizeof(struct tcp_upload_params),
+			  p_new->tcp_upload_params_virt_addr,
+			  p_new->tcp_upload_params_phys_addr);
+err_free_q_cnts:
+	dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+			  sizeof(struct scsi_terminate_extra_params),
+			  p_new->queue_cnts_virt_addr,
+			  p_new->queue_cnts_phys_addr);
+err_free_conn:
+	kfree(p_new);
+
+	return -ENOMEM;
+}
+
+/* Acquire an iSCSI CID and bind it to a connection object.
+ *
+ * @p_in_conn:  connection to use, or NULL to obtain one from
+ *              qed_iscsi_allocate_connection().
+ * @p_out_conn: receives the prepared connection on success.
+ *
+ * On success the connection's icid, conn_id (low 16 bits of the CID) and
+ * fw_cid (opaque_fid in the upper 16 bits, CID in the lower bits) are set.
+ *
+ * On failure the CID is released again. A connection that was obtained from
+ * the allocator here is put on the free list, so that it remains reachable
+ * instead of being orphaned when qed_iscsi_setup_connection() fails.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int qed_iscsi_acquire_connection(struct qed_hwfn *p_hwfn,
+					struct qed_iscsi_conn *p_in_conn,
+					struct qed_iscsi_conn **p_out_conn)
+{
+	struct qed_iscsi_conn *p_conn = NULL;
+	int rc = 0;
+	u32 icid;
+
+	spin_lock_bh(&p_hwfn->p_iscsi_info->lock);
+	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ISCSI, &icid);
+	spin_unlock_bh(&p_hwfn->p_iscsi_info->lock);
+	if (rc)
+		return rc;
+
+	/* Use input connection or allocate a new one */
+	if (p_in_conn)
+		p_conn = p_in_conn;
+	else
+		rc = qed_iscsi_allocate_connection(p_hwfn, &p_conn);
+
+	if (!rc)
+		rc = qed_iscsi_setup_connection(p_hwfn, p_conn);
+
+	if (rc) {
+		spin_lock_bh(&p_hwfn->p_iscsi_info->lock);
+		/* p_conn is only non-NULL here with a NULL p_in_conn when
+		 * the allocator succeeded and the setup step failed.
+		 */
+		if (!p_in_conn && p_conn)
+			list_add_tail(&p_conn->list_entry,
+				      &p_hwfn->p_iscsi_info->free_list);
+		qed_cxt_release_cid(p_hwfn, icid);
+		spin_unlock_bh(&p_hwfn->p_iscsi_info->lock);
+		return rc;
+	}
+
+	p_conn->icid = icid;
+	p_conn->conn_id = (u16)icid;
+	p_conn->fw_cid = (p_hwfn->hw_info.opaque_fid << 16) | icid;
+
+	*p_out_conn = p_conn;
+
+	return rc;
+}
+
+/* Give a connection back: park the object on the free list for later reuse
+ * and release its CID. Both steps are done under the iSCSI info lock.
+ */
+static void qed_iscsi_release_connection(struct qed_hwfn *p_hwfn,
+					 struct qed_iscsi_conn *p_conn)
+{
+	struct qed_iscsi_info *p_info = p_hwfn->p_iscsi_info;
+
+	spin_lock_bh(&p_info->lock);
+	list_add_tail(&p_conn->list_entry, &p_info->free_list);
+	qed_cxt_release_cid(p_hwfn, p_conn->icid);
+	spin_unlock_bh(&p_info->lock);
+}
+
+/* Allocate a zeroed qed_iscsi_info with an empty connection free list.
+ *
+ * Returns the new object, or NULL if the allocation fails. @p_hwfn is not
+ * used.
+ */
+struct qed_iscsi_info *qed_iscsi_alloc(struct qed_hwfn *p_hwfn)
+{
+	struct qed_iscsi_info *p_info = kzalloc(sizeof(*p_info), GFP_KERNEL);
+
+	if (p_info)
+		INIT_LIST_HEAD(&p_info->free_list);
+
+	return p_info;
+}
+
+/* Initialise the spinlock of @p_iscsi_info. Elsewhere in this file that
+ * lock is held around free-list manipulation and CID acquire/release.
+ * @p_hwfn is not used.
+ */
+void qed_iscsi_setup(struct qed_hwfn *p_hwfn,
+		     struct qed_iscsi_info *p_iscsi_info)
+{
+	spin_lock_init(&p_iscsi_info->lock);
+}
+
+/* Release @p_iscsi_info together with the connections parked on its free
+ * list.
+ *
+ * Released connections are kept on the free list with their chains and DMA
+ * buffers still allocated, so freeing only the info structure would leak
+ * them. Every parked connection is unlinked; those flagged free_on_delete
+ * (i.e. created by qed_iscsi_allocate_connection()) are destroyed.
+ *
+ * @p_iscsi_info may be NULL, e.g. when resource allocation failed before the
+ * iSCSI info was created. The lock is deliberately not taken: it may never
+ * have been initialised if qed_iscsi_setup() did not run.
+ * NOTE(review): assumes no other context touches the free list during
+ * teardown - confirm against the callers of qed_resc_free().
+ */
+void qed_iscsi_free(struct qed_hwfn *p_hwfn,
+		    struct qed_iscsi_info *p_iscsi_info)
+{
+	struct qed_iscsi_conn *p_conn, *p_next;
+
+	if (!p_iscsi_info)
+		return;
+
+	list_for_each_entry_safe(p_conn, p_next, &p_iscsi_info->free_list,
+				 list_entry) {
+		list_del(&p_conn->list_entry);
+
+		/* Never free an object this file did not allocate */
+		if (!p_conn->free_on_delete)
+			continue;
+
+		qed_chain_free(p_hwfn->cdev, &p_conn->xhq);
+		qed_chain_free(p_hwfn->cdev, &p_conn->uhq);
+		qed_chain_free(p_hwfn->cdev, &p_conn->r2tq);
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(struct tcp_upload_params),
+				  p_conn->tcp_upload_params_virt_addr,
+				  p_conn->tcp_upload_params_phys_addr);
+		dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+				  sizeof(struct scsi_terminate_extra_params),
+				  p_conn->queue_cnts_virt_addr,
+				  p_conn->queue_cnts_phys_addr);
+		kfree(p_conn);
+	}
+
+	kfree(p_iscsi_info);
+}
+
+/* Read the Tstorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the RX byte/packet counters and the CMDQ/RQ/IMMQ threshold
+ * counters in @p_stats.
+ */
+static void _qed_iscsi_get_tstats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_TSDM_RAM +
+		   TSTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct tstorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	/* 64-bit counters are stored as hi/lo register pairs */
+	p_stats->iscsi_rx_bytes_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_bytes_cnt);
+	p_stats->iscsi_rx_packet_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_packet_cnt);
+
+	/* 32-bit little-endian counters */
+	p_stats->iscsi_cmdq_threshold_cnt =
+	    le32_to_cpu(fw_stats.iscsi_cmdq_threshold_cnt);
+	p_stats->iscsi_rq_threshold_cnt =
+	    le32_to_cpu(fw_stats.iscsi_rq_threshold_cnt);
+	p_stats->iscsi_immq_threshold_cnt =
+	    le32_to_cpu(fw_stats.iscsi_immq_threshold_cnt);
+}
+
+/* Read the Mstorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the "dropped PDUs, task not valid" counter in @p_stats.
+ */
+static void _qed_iscsi_get_mstats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_MSDM_RAM +
+		   MSTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct mstorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	p_stats->iscsi_rx_dropped_pdus_task_not_valid =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_dropped_pdus_task_not_valid);
+}
+
+/* Read the Ustorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the RX data/R2T/total PDU counters in @p_stats.
+ */
+static void _qed_iscsi_get_ustats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_USDM_RAM +
+		   USTORM_ISCSI_RX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct ustorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	p_stats->iscsi_rx_data_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_data_pdu_cnt);
+	p_stats->iscsi_rx_r2t_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_r2t_pdu_cnt);
+	p_stats->iscsi_rx_total_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_rx_total_pdu_cnt);
+}
+
+/* Read the Xstorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the TX slow-start and fast-retransmit event counters in
+ * @p_stats.
+ */
+static void _qed_iscsi_get_xstats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_XSDM_RAM +
+		   XSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct xstorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	p_stats->iscsi_tx_go_to_slow_start_event_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_go_to_slow_start_event_cnt);
+	p_stats->iscsi_tx_fast_retransmit_event_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_fast_retransmit_event_cnt);
+}
+
+/* Read the Ystorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the TX data/R2T/total PDU counters in @p_stats.
+ */
+static void _qed_iscsi_get_ystats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_YSDM_RAM +
+		   YSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct ystorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	p_stats->iscsi_tx_data_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_data_pdu_cnt);
+	p_stats->iscsi_tx_r2t_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_r2t_pdu_cnt);
+	p_stats->iscsi_tx_total_pdu_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_total_pdu_cnt);
+}
+
+/* Read the Pstorm iSCSI statistics block selected by the hwfn's rel_pf_id
+ * and store the TX byte and packet counters in @p_stats.
+ */
+static void _qed_iscsi_get_pstats(struct qed_hwfn *p_hwfn,
+				  struct qed_ptt *p_ptt,
+				  struct qed_iscsi_stats *p_stats)
+{
+	u32 addr = BAR0_MAP_REG_PSDM_RAM +
+		   PSTORM_ISCSI_TX_STATS_OFFSET(p_hwfn->rel_pf_id);
+	struct pstorm_iscsi_stats_drv fw_stats;
+
+	memset(&fw_stats, 0, sizeof(fw_stats));
+	qed_memcpy_from(p_hwfn, p_ptt, &fw_stats, addr, sizeof(fw_stats));
+
+	p_stats->iscsi_tx_bytes_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_bytes_cnt);
+	p_stats->iscsi_tx_packet_cnt =
+	    HILO_64_REGPAIR(fw_stats.iscsi_tx_packet_cnt);
+}
+
+/* Collect the iSCSI statistics of all six storms into @stats.
+ *
+ * @stats is cleared first, so it is zeroed even when the function fails.
+ *
+ * Returns 0 on success, -EAGAIN if no PTT window could be acquired.
+ */
+static int qed_iscsi_get_stats(struct qed_hwfn *p_hwfn,
+			       struct qed_iscsi_stats *stats)
+{
+	struct qed_ptt *p_window;
+
+	memset(stats, 0, sizeof(*stats));
+
+	p_window = qed_ptt_acquire(p_hwfn);
+	if (!p_window) {
+		DP_ERR(p_hwfn, "Failed to acquire ptt\n");
+		return -EAGAIN;
+	}
+
+	/* RX side */
+	_qed_iscsi_get_tstats(p_hwfn, p_window, stats);
+	_qed_iscsi_get_mstats(p_hwfn, p_window, stats);
+	_qed_iscsi_get_ustats(p_hwfn, p_window, stats);
+
+	/* TX side */
+	_qed_iscsi_get_xstats(p_hwfn, p_window, stats);
+	_qed_iscsi_get_ystats(p_hwfn, p_window, stats);
+	_qed_iscsi_get_pstats(p_hwfn, p_window, stats);
+
+	qed_ptt_release(p_hwfn, p_window);
+
+	return 0;
+}
+
+/* Entry of the cdev->connections hash table. Entries are keyed by the
+ * connection's icid, which is also the handle given to the upper layer.
+ */
+struct qed_hash_iscsi_con {
+	struct hlist_node node;		/* linkage in the hash bucket */
+	struct qed_iscsi_conn *con;	/* the connection being tracked */
+};
+
+/* Fill @info with the common device information and with the addresses of
+ * the primary and secondary RQ BDQ producers of the leading hwfn.
+ *
+ * The producer addresses are filled in even if qed_fill_dev_info() fails;
+ * its return value is what this function returns.
+ */
+static int qed_fill_iscsi_dev_info(struct qed_dev *cdev,
+				   struct qed_dev_iscsi_info *info)
+{
+	struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev);
+	void __iomem *p_primary, *p_secondary;
+	int rc;
+
+	memset(info, 0, sizeof(*info));
+	rc = qed_fill_dev_info(cdev, &info->common);
+
+	p_primary = qed_iscsi_get_primary_bdq_prod(p_lead, BDQ_ID_RQ);
+	p_secondary = qed_iscsi_get_secondary_bdq_prod(p_lead, BDQ_ID_RQ);
+
+	info->primary_dbq_rq_addr = p_primary;
+	info->secondary_bdq_rq_addr = p_secondary;
+
+	return rc;
+}
+
+/* Record the upper layer's iSCSI callback table and its cookie in @cdev.
+ * Any previously registered values are overwritten.
+ */
+static void qed_register_iscsi_ops(struct qed_dev *cdev,
+				   struct qed_iscsi_cb_ops *ops, void *cookie)
+{
+	cdev->ops_cookie = cookie;
+	cdev->protocol_ops.iscsi = ops;
+}
+
+/* Look up the hash entry whose connection icid equals @handle.
+ *
+ * Returns the entry, or NULL if storage has not been started or no such
+ * connection is tracked.
+ */
+static struct qed_hash_iscsi_con *qed_iscsi_get_hash(struct qed_dev *cdev,
+						     u32 handle)
+{
+	struct qed_hash_iscsi_con *p_entry;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED))
+		return NULL;
+
+	/* Only the bucket @handle hashes to needs to be walked */
+	hash_for_each_possible(cdev->connections, p_entry, node, handle) {
+		if (p_entry->con->icid == handle)
+			return p_entry;
+	}
+
+	return NULL;
+}
+
+/* Stop the iSCSI function on the leading hwfn.
+ *
+ * Nothing is done (and 0 is returned) if storage is not started. The call
+ * is refused with -EINVAL while connections are still tracked in the hash
+ * table. Otherwise the destroy-function ramrod is posted and the "storage
+ * started" flag is cleared regardless of the ramrod's outcome.
+ *
+ * Returns 0, -EINVAL, or the result of qed_sp_iscsi_func_stop().
+ */
+static int qed_iscsi_stop(struct qed_dev *cdev)
+{
+	int rc;
+
+	if (!(cdev->flags & QED_FLAG_STORAGE_STARTED)) {
+		DP_NOTICE(cdev, "iscsi already stopped\n");
+		return 0;
+	}
+
+	if (!hash_empty(cdev->connections)) {
+		DP_NOTICE(cdev,
+			  "Can't stop iscsi - not all connections were returned\n");
+		return -EINVAL;
+	}
+
+	rc = qed_sp_iscsi_func_stop(QED_LEADING_HWFN(cdev),
+				    QED_SPQ_MODE_EBLOCK, NULL);
+
+	/* The flag is dropped even if the ramrod failed */
+	cdev->flags &= ~QED_FLAG_STORAGE_STARTED;
+
+	return rc;
+}
+
+/* Start the iSCSI function on the leading hwfn.
+ *
+ * @tasks:          optional; when non-NULL it is filled with the TID memory
+ *                  layout (task size, tids per block and block pointers).
+ * @event_context:  context passed on with asynchronous events.
+ * @async_event_cb: callback for asynchronous events.
+ *
+ * Nothing is done (and 0 is returned) if storage is already started. If the
+ * task information cannot be gathered, the function is stopped again.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int qed_iscsi_start(struct qed_dev *cdev,
+			   struct qed_iscsi_tid *tasks,
+			   void *event_context,
+			   iscsi_event_cb_t async_event_cb)
+{
+	struct qed_hwfn *p_lead = QED_LEADING_HWFN(cdev);
+	struct qed_tid_mem *p_tid_mem;
+	int rc;
+
+	if (cdev->flags & QED_FLAG_STORAGE_STARTED) {
+		DP_NOTICE(cdev, "iscsi already started;\n");
+		return 0;
+	}
+
+	rc = qed_sp_iscsi_func_start(p_lead, QED_SPQ_MODE_EBLOCK, NULL,
+				     event_context, async_event_cb);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to start iscsi\n");
+		return rc;
+	}
+
+	cdev->flags |= QED_FLAG_STORAGE_STARTED;
+	hash_init(cdev->connections);
+
+	/* The caller is not interested in the task layout */
+	if (!tasks)
+		return 0;
+
+	p_tid_mem = kzalloc(sizeof(*p_tid_mem), GFP_KERNEL);
+	if (!p_tid_mem) {
+		rc = -ENOMEM;
+		goto err_stop;
+	}
+
+	rc = qed_cxt_get_tid_mem_info(p_lead, p_tid_mem);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to gather task information\n");
+		goto err_stop;
+	}
+
+	/* Fill task information */
+	tasks->size = p_tid_mem->tid_size;
+	tasks->num_tids_per_block = p_tid_mem->num_tids_per_block;
+	memcpy(tasks->blocks, p_tid_mem->blocks,
+	       MAX_TID_BLOCKS_ISCSI * sizeof(u8 *));
+
+	kfree(p_tid_mem);
+
+	return 0;
+
+err_stop:
+	/* Undo the function start; kfree() accepts a NULL pointer */
+	qed_iscsi_stop(cdev);
+	kfree(p_tid_mem);
+	return rc;
+}
+
+static int qed_iscsi_acquire_conn(struct qed_dev *cdev,
+				  u32 *handle,
+				  u32 *fw_cid, void __iomem **p_doorbell)
+{
+	struct qed_hash_iscsi_con *hash_con;
+	int rc;
+
+	/* Allocate a hashed connection */
+	hash_con = kzalloc(sizeof(*hash_con), GFP_ATOMIC);
+	if (!hash_con)
+		return -ENOMEM;
+
+	/* Acquire the connection */
+	rc = qed_iscsi_acquire_connection(QED_LEADING_HWFN(cdev), NULL,
+					  &hash_con->con);
+	if (rc) {
+		DP_NOTICE(cdev, "Failed to acquire Connection\n");
+		kfree(hash_con);
+		return rc;
+	}
+
+	/* Added the connection to hash table */
+	*handle = hash_con->con->icid;
+	*fw_cid = hash_con->con->fw_cid;
+	hash_add(cdev->connections, &hash_con->node, *handle);
+
+	if (p_doorbell)
+		*p_doorbell = qed_iscsi_get_db_addr(QED_LEADING_HWFN(cdev),
+						    *handle);
+
+	return 0;
+}
+
+static int qed_iscsi_release_conn(struct qed_dev *cdev, u32 handle)
+{
+	struct qed_hash_iscsi_con *hash_con;
+
+	hash_con = qed_iscsi_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	hlist_del(&hash_con->node);
+	qed_iscsi_release_connection(QED_LEADING_HWFN(cdev), hash_con->con);
+	kfree(hash_con);
+
+	return 0;
+}
+
+static int qed_iscsi_offload_conn(struct qed_dev *cdev,
+				  u32 handle,
+				  struct qed_iscsi_params_offload *conn_info)
+{
+	struct qed_hash_iscsi_con *hash_con;
+	struct qed_iscsi_conn *con;
+
+	hash_con = qed_iscsi_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+
+	ether_addr_copy(con->local_mac, conn_info->src.mac);
+	ether_addr_copy(con->remote_mac, conn_info->dst.mac);
+	memcpy(con->local_ip, conn_info->src.ip, sizeof(con->local_ip));
+	memcpy(con->remote_ip, conn_info->dst.ip, sizeof(con->remote_ip));
+	con->local_port = conn_info->src.port;
+	con->remote_port = conn_info->dst.port;
+
+	con->layer_code = conn_info->layer_code;
+	con->sq_pbl_addr = conn_info->sq_pbl_addr;
+	con->initial_ack = conn_info->initial_ack;
+	con->vlan_id = conn_info->vlan_id;
+	con->tcp_flags = conn_info->tcp_flags;
+	con->ip_version = conn_info->ip_version;
+	con->default_cq = conn_info->default_cq;
+	con->ka_max_probe_cnt = conn_info->ka_max_probe_cnt;
+	con->dup_ack_theshold = conn_info->dup_ack_theshold;
+	con->rcv_next = conn_info->rcv_next;
+	con->snd_una = conn_info->snd_una;
+	con->snd_next = conn_info->snd_next;
+	con->snd_max = conn_info->snd_max;
+	con->snd_wnd = conn_info->snd_wnd;
+	con->rcv_wnd = conn_info->rcv_wnd;
+	con->snd_wl1 = conn_info->snd_wl1;
+	con->cwnd = conn_info->cwnd;
+	con->ss_thresh = conn_info->ss_thresh;
+	con->srtt = conn_info->srtt;
+	con->rtt_var = conn_info->rtt_var;
+	con->ts_time = conn_info->ts_time;
+	con->ts_recent = conn_info->ts_recent;
+	con->ts_recent_age = conn_info->ts_recent_age;
+	con->total_rt = conn_info->total_rt;
+	con->ka_timeout_delta = conn_info->ka_timeout_delta;
+	con->rt_timeout_delta = conn_info->rt_timeout_delta;
+	con->dup_ack_cnt = conn_info->dup_ack_cnt;
+	con->snd_wnd_probe_cnt = conn_info->snd_wnd_probe_cnt;
+	con->ka_probe_cnt = conn_info->ka_probe_cnt;
+	con->rt_cnt = conn_info->rt_cnt;
+	con->flow_label = conn_info->flow_label;
+	con->ka_timeout = conn_info->ka_timeout;
+	con->ka_interval = conn_info->ka_interval;
+	con->max_rt_time = conn_info->max_rt_time;
+	con->initial_rcv_wnd = conn_info->initial_rcv_wnd;
+	con->ttl = conn_info->ttl;
+	con->tos_or_tc = conn_info->tos_or_tc;
+	con->remote_port = conn_info->remote_port;
+	con->local_port = conn_info->local_port;
+	con->mss = conn_info->mss;
+	con->snd_wnd_scale = conn_info->snd_wnd_scale;
+	con->rcv_wnd_scale = conn_info->rcv_wnd_scale;
+	con->ts_ticks_per_second = conn_info->ts_ticks_per_second;
+	con->da_timeout_value = conn_info->da_timeout_value;
+	con->ack_frequency = conn_info->ack_frequency;
+
+	/* Set default values on other connection fields */
+	con->offl_flags = 0x1;
+
+	return qed_sp_iscsi_conn_offload(QED_LEADING_HWFN(cdev), con,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_iscsi_update_conn(struct qed_dev *cdev,
+				 u32 handle,
+				 struct qed_iscsi_params_update *conn_info)
+{
+	struct qed_hash_iscsi_con *hash_con;
+	struct qed_iscsi_conn *con;
+
+	hash_con = qed_iscsi_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	/* Update the connection with information from the params */
+	con = hash_con->con;
+	con->update_flag = conn_info->update_flag;
+	con->max_seq_size = conn_info->max_seq_size;
+	con->max_recv_pdu_length = conn_info->max_recv_pdu_length;
+	con->max_send_pdu_length = conn_info->max_send_pdu_length;
+	con->first_seq_length = conn_info->first_seq_length;
+	con->exp_stat_sn = conn_info->exp_stat_sn;
+
+	return qed_sp_iscsi_conn_update(QED_LEADING_HWFN(cdev), con,
+					QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_iscsi_clear_conn_sq(struct qed_dev *cdev, u32 handle)
+{
+	struct qed_hash_iscsi_con *hash_con;
+
+	hash_con = qed_iscsi_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	return qed_sp_iscsi_conn_clear_sq(QED_LEADING_HWFN(cdev),
+					  hash_con->con,
+					  QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_iscsi_destroy_conn(struct qed_dev *cdev,
+				  u32 handle, u8 abrt_conn)
+{
+	struct qed_hash_iscsi_con *hash_con;
+
+	hash_con = qed_iscsi_get_hash(cdev, handle);
+	if (!hash_con) {
+		DP_NOTICE(cdev, "Failed to find connection for handle %d\n",
+			  handle);
+		return -EINVAL;
+	}
+
+	hash_con->con->abortive_dsconnect = abrt_conn;
+
+	return qed_sp_iscsi_conn_terminate(QED_LEADING_HWFN(cdev),
+					   hash_con->con,
+					   QED_SPQ_MODE_EBLOCK, NULL);
+}
+
+static int qed_iscsi_stats(struct qed_dev *cdev, struct qed_iscsi_stats *stats)
+{
+	return qed_iscsi_get_stats(QED_LEADING_HWFN(cdev), stats);
+}
+
+static const struct qed_iscsi_ops qed_iscsi_ops_pass = {
+	.common = &qed_common_ops_pass,
+	.ll2 = &qed_ll2_ops_pass,
+	.fill_dev_info = &qed_fill_iscsi_dev_info,
+	.register_ops = &qed_register_iscsi_ops,
+	.start = &qed_iscsi_start,
+	.stop = &qed_iscsi_stop,
+	.acquire_conn = &qed_iscsi_acquire_conn,
+	.release_conn = &qed_iscsi_release_conn,
+	.offload_conn = &qed_iscsi_offload_conn,
+	.update_conn = &qed_iscsi_update_conn,
+	.destroy_conn = &qed_iscsi_destroy_conn,
+	.clear_sq = &qed_iscsi_clear_conn_sq,
+	.get_stats = &qed_iscsi_stats,
+};
+
+const struct qed_iscsi_ops *qed_get_iscsi_ops(void)
+{
+	return &qed_iscsi_ops_pass;
+}
+EXPORT_SYMBOL(qed_get_iscsi_ops);
+
+void qed_put_iscsi_ops(void)
+{
+}
+EXPORT_SYMBOL(qed_put_iscsi_ops);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.h b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h
new file mode 100644
index 000000000000..67c25f3db4d5
--- /dev/null
+++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.h
@@ -0,0 +1,52 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_ISCSI_H
+#define _QED_ISCSI_H
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/qed/tcp_common.h>
+#include <linux/qed/qed_iscsi_if.h>
+#include <linux/qed/qed_chain.h>
+#include "qed.h"
+#include "qed_hsi.h"
+#include "qed_mcp.h"
+#include "qed_sp.h"
+
+struct qed_iscsi_info {
+	spinlock_t lock; /* Connection resources. */
+	struct list_head free_list;
+	u16 max_num_outstanding_tasks;
+	void *event_context;
+	iscsi_event_cb_t event_cb;
+};
+
+#ifdef CONFIG_QED_LL2
+extern const struct qed_ll2_ops qed_ll2_ops_pass;
+#endif
+
+#if IS_ENABLED(CONFIG_QED_ISCSI)
+struct qed_iscsi_info *qed_iscsi_alloc(struct qed_hwfn *p_hwfn);
+
+void qed_iscsi_setup(struct qed_hwfn *p_hwfn,
+		     struct qed_iscsi_info *p_iscsi_info);
+
+void qed_iscsi_free(struct qed_hwfn *p_hwfn,
+		    struct qed_iscsi_info *p_iscsi_info);
+#else /* IS_ENABLED(CONFIG_QED_ISCSI) */
+static inline struct qed_iscsi_info *qed_iscsi_alloc(
+		struct qed_hwfn *p_hwfn) { return NULL; }
+static inline void qed_iscsi_setup(struct qed_hwfn *p_hwfn,
+				   struct qed_iscsi_info *p_iscsi_info) {}
+static inline void qed_iscsi_free(struct qed_hwfn *p_hwfn,
+				  struct qed_iscsi_info *p_iscsi_info) {}
+#endif /* IS_ENABLED(CONFIG_QED_ISCSI) */
+
+#endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index f95385cbbd40..84d7e9146b00 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1517,6 +1517,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 	enum qed_ll2_conn_type conn_type;
 	struct qed_ptt *p_ptt;
 	int rc, i;
+	u8 gsi_enable = 1;
 
 	/* Initialize LL2 locks & lists */
 	INIT_LIST_HEAD(&cdev->ll2->list);
@@ -1548,6 +1549,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 	switch (QED_LEADING_HWFN(cdev)->hw_info.personality) {
 	case QED_PCI_ISCSI:
 		conn_type = QED_LL2_TYPE_ISCSI;
+		gsi_enable = 0;
 		break;
 	case QED_PCI_ETH_ROCE:
 		conn_type = QED_LL2_TYPE_ROCE;
@@ -1564,7 +1566,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 	ll2_info.rx_vlan_removal_en = params->rx_vlan_stripping;
 	ll2_info.tx_tc = 0;
 	ll2_info.tx_dest = CORE_TX_DEST_NW;
-	ll2_info.gsi_enable = 1;
+	ll2_info.gsi_enable = gsi_enable;
 
 	rc = qed_ll2_acquire_connection(QED_LEADING_HWFN(cdev), &ll2_info,
 					QED_LL2_RX_SIZE, QED_LL2_TX_SIZE,
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index b414a0542177..97544205a8c1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -82,6 +82,8 @@
 	0x1c80000UL
 #define BAR0_MAP_REG_XSDM_RAM \
 	0x1e00000UL
+#define BAR0_MAP_REG_YSDM_RAM \
+	0x1e80000UL
 #define  NIG_REG_RX_LLH_BRB_GATE_DNTFWD_PERPF \
 	0x5011f4UL
 #define  PRS_REG_SEARCH_TCP \
diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c
index 019960b7855a..56d2f64a3655 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_spq.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c
@@ -24,6 +24,7 @@
 #include "qed_hsi.h"
 #include "qed_hw.h"
 #include "qed_int.h"
+#include "qed_iscsi.h"
 #include "qed_mcp.h"
 #include "qed_reg_addr.h"
 #include "qed_sp.h"
@@ -277,6 +278,20 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn,
 		return qed_sriov_eqe_event(p_hwfn,
 					   p_eqe->opcode,
 					   p_eqe->echo, &p_eqe->data);
+	case PROTOCOLID_ISCSI:
+		if (!IS_ENABLED(CONFIG_QED_ISCSI))
+			return -EINVAL;
+
+		if (p_hwfn->p_iscsi_info->event_cb) {
+			struct qed_iscsi_info *p_iscsi = p_hwfn->p_iscsi_info;
+
+			return p_iscsi->event_cb(p_iscsi->event_context,
+						 p_eqe->opcode, &p_eqe->data);
+		} else {
+			DP_NOTICE(p_hwfn,
+				  "iSCSI async completion is not set\n");
+			return -EINVAL;
+		}
 	default:
 		DP_NOTICE(p_hwfn,
 			  "Unknown Async completion for protocol: %d\n",
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ea095b4893aa..4b454f4f5b25 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -166,6 +166,7 @@ struct qed_iscsi_pf_params {
 	u32 max_cwnd;
 	u16 cq_num_entries;
 	u16 cmdq_num_entries;
+	u32 two_msl_timer;
 	u16 dup_ack_threshold;
 	u16 tx_sws_timer;
 	u16 min_rto;
@@ -275,6 +276,7 @@ struct qed_dev_info {
 enum qed_sb_type {
 	QED_SB_TYPE_L2_QUEUE,
 	QED_SB_TYPE_CNQ,
+	QED_SB_TYPE_STORAGE,
 };
 
 enum qed_protocol {
diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h
new file mode 100644
index 000000000000..d27912480cb3
--- /dev/null
+++ b/include/linux/qed/qed_iscsi_if.h
@@ -0,0 +1,229 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_ISCSI_IF_H
+#define _QED_ISCSI_IF_H
+#include <linux/types.h>
+#include <linux/qed/qed_if.h>
+
+typedef int (*iscsi_event_cb_t) (void *context,
+				 u8 fw_event_code, void *fw_handle);
+struct qed_iscsi_stats {
+	u64 iscsi_rx_bytes_cnt;
+	u64 iscsi_rx_packet_cnt;
+	u64 iscsi_rx_new_ooo_isle_events_cnt;
+	u32 iscsi_cmdq_threshold_cnt;
+	u32 iscsi_rq_threshold_cnt;
+	u32 iscsi_immq_threshold_cnt;
+
+	u64 iscsi_rx_dropped_pdus_task_not_valid;
+
+	u64 iscsi_rx_data_pdu_cnt;
+	u64 iscsi_rx_r2t_pdu_cnt;
+	u64 iscsi_rx_total_pdu_cnt;
+
+	u64 iscsi_tx_go_to_slow_start_event_cnt;
+	u64 iscsi_tx_fast_retransmit_event_cnt;
+
+	u64 iscsi_tx_data_pdu_cnt;
+	u64 iscsi_tx_r2t_pdu_cnt;
+	u64 iscsi_tx_total_pdu_cnt;
+
+	u64 iscsi_tx_bytes_cnt;
+	u64 iscsi_tx_packet_cnt;
+};
+
+struct qed_dev_iscsi_info {
+	struct qed_dev_info common;
+
+	void __iomem *primary_dbq_rq_addr;
+	void __iomem *secondary_bdq_rq_addr;
+};
+
+struct qed_iscsi_id_params {
+	u8 mac[ETH_ALEN];
+	u32 ip[4];
+	u16 port;
+};
+
+struct qed_iscsi_params_offload {
+	u8 layer_code;
+	dma_addr_t sq_pbl_addr;
+	u32 initial_ack;
+
+	struct qed_iscsi_id_params src;
+	struct qed_iscsi_id_params dst;
+	u16 vlan_id;
+	u8 tcp_flags;
+	u8 ip_version;
+	u8 default_cq;
+
+	u8 ka_max_probe_cnt;
+	u8 dup_ack_theshold;
+	u32 rcv_next;
+	u32 snd_una;
+	u32 snd_next;
+	u32 snd_max;
+	u32 snd_wnd;
+	u32 rcv_wnd;
+	u32 snd_wl1;
+	u32 cwnd;
+	u32 ss_thresh;
+	u16 srtt;
+	u16 rtt_var;
+	u32 ts_time;
+	u32 ts_recent;
+	u32 ts_recent_age;
+	u32 total_rt;
+	u32 ka_timeout_delta;
+	u32 rt_timeout_delta;
+	u8 dup_ack_cnt;
+	u8 snd_wnd_probe_cnt;
+	u8 ka_probe_cnt;
+	u8 rt_cnt;
+	u32 flow_label;
+	u32 ka_timeout;
+	u32 ka_interval;
+	u32 max_rt_time;
+	u32 initial_rcv_wnd;
+	u8 ttl;
+	u8 tos_or_tc;
+	u16 remote_port;
+	u16 local_port;
+	u16 mss;
+	u8 snd_wnd_scale;
+	u8 rcv_wnd_scale;
+	u32 ts_ticks_per_second;
+	u16 da_timeout_value;
+	u8 ack_frequency;
+};
+
+struct qed_iscsi_params_update {
+	u8 update_flag;
+#define QED_ISCSI_CONN_HD_EN            BIT(0)
+#define QED_ISCSI_CONN_DD_EN            BIT(1)
+#define QED_ISCSI_CONN_INITIAL_R2T      BIT(2)
+#define QED_ISCSI_CONN_IMMEDIATE_DATA   BIT(3)
+
+	u32 max_seq_size;
+	u32 max_recv_pdu_length;
+	u32 max_send_pdu_length;
+	u32 first_seq_length;
+	u32 exp_stat_sn;
+};
+
+#define MAX_TID_BLOCKS_ISCSI (512)
+struct qed_iscsi_tid {
+	u32 size;		/* In bytes per task */
+	u32 num_tids_per_block;
+	u8 *blocks[MAX_TID_BLOCKS_ISCSI];
+};
+
+struct qed_iscsi_cb_ops {
+	struct qed_common_cb_ops common;
+};
+
+/**
+ * struct qed_iscsi_ops - qed iSCSI operations.
+ * @common:		common operations pointer
+ * @ll2:		light L2 operations pointer
+ * @fill_dev_info:	fills iSCSI specific information
+ *			@param cdev
+ *			@param info
+ *			@return 0 on success, otherwise error value.
+ * @register_ops:	register iscsi operations
+ *			@param cdev
+ *			@param ops - specified using qed_iscsi_cb_ops
+ *			@param cookie - driver private
+ * @start:		iscsi in FW
+ *			@param cdev
+ *			@param tasks - qed will fill information about tasks
+ *			return 0 on success, otherwise error value.
+ * @stop:		iscsi in FW
+ *			@param cdev
+ *			return 0 on success, otherwise error value.
+ * @acquire_conn:	acquire a new iscsi connection
+ *			@param cdev
+ *			@param handle - qed will fill handle that should be
+ *				used henceforth as identifier of the
+ *				connection.
+ *			@param p_doorbell - qed will fill the address of the
+ *				doorbell.
+ *			@return 0 on success, otherwise error value.
+ * @release_conn:	release a previously acquired iscsi connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @offload_conn:	configures an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @update_conn:	updates an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @destroy_conn:	stops an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @clear_sq:		clear all task in sq
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @get_stats:		iSCSI related statistics
+ *			@param cdev
+ *			@param stats - pointer to struct that would be filled
+ *				with stats
+ *			@return 0 on success, error otherwise.
+ */
+struct qed_iscsi_ops {
+	const struct qed_common_ops *common;
+
+	const struct qed_ll2_ops *ll2;
+
+	int (*fill_dev_info)(struct qed_dev *cdev,
+			     struct qed_dev_iscsi_info *info);
+
+	void (*register_ops)(struct qed_dev *cdev,
+			     struct qed_iscsi_cb_ops *ops, void *cookie);
+
+	int (*start)(struct qed_dev *cdev,
+		     struct qed_iscsi_tid *tasks,
+		     void *event_context, iscsi_event_cb_t async_event_cb);
+
+	int (*stop)(struct qed_dev *cdev);
+
+	int (*acquire_conn)(struct qed_dev *cdev,
+			    u32 *handle,
+			    u32 *fw_cid, void __iomem **p_doorbell);
+
+	int (*release_conn)(struct qed_dev *cdev, u32 handle);
+
+	int (*offload_conn)(struct qed_dev *cdev,
+			    u32 handle,
+			    struct qed_iscsi_params_offload *conn_info);
+
+	int (*update_conn)(struct qed_dev *cdev,
+			   u32 handle,
+			   struct qed_iscsi_params_update *conn_info);
+
+	int (*destroy_conn)(struct qed_dev *cdev, u32 handle, u8 abrt_conn);
+
+	int (*clear_sq)(struct qed_dev *cdev, u32 handle);
+
+	int (*get_stats)(struct qed_dev *cdev,
+			 struct qed_iscsi_stats *stats);
+};
+
+const struct qed_iscsi_ops *qed_get_iscsi_ops(void);
+void qed_put_iscsi_ops(void);
+#endif
-- 
cgit v1.2.3


From 95a22caee396cef0bb2ca8fafdd82966a49367bb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 1 Dec 2016 11:32:06 +0100
Subject: tcp: randomize tcp timestamp offsets for each connection

jiffies based timestamps allow for easy inference of number of devices
behind NAT translators and also makes tracking of hosts simpler.

commit ceaa1fef65a7c2e ("tcp: adding a per-socket timestamp offset")
added the main infrastructure that is needed for per-connection ts
randomization, in particular writing/reading the on-wire tcp header
format takes the offset into account so rest of stack can use normal
tcp_time_stamp (jiffies).

So only two items are left:
 - add a tsoffset for request sockets
 - extend the tcp isn generator to also return another 32bit number
   in addition to the ISN.

Re-use of ISN generator also means timestamps are still monotonically
increasing for same connection quadruple, i.e. PAWS will still work.

Includes fixes from Eric Dumazet.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  1 +
 include/net/secure_seq.h |  8 ++++----
 include/net/tcp.h        |  2 +-
 net/core/secure_seq.c    | 10 ++++++----
 net/ipv4/syncookies.c    |  1 +
 net/ipv4/tcp_input.c     |  7 ++++++-
 net/ipv4/tcp_ipv4.c      |  9 +++++----
 net/ipv4/tcp_minisocks.c |  4 +++-
 net/ipv4/tcp_output.c    |  2 +-
 net/ipv6/syncookies.c    |  1 +
 net/ipv6/tcp_ipv6.c      | 10 ++++++----
 11 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 00e0ee8f001f..734bab4c3bef 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -123,6 +123,7 @@ struct tcp_request_sock {
 	u32				txhash;
 	u32				rcv_isn;
 	u32				snt_isn;
+	u32				ts_off;
 	u32				last_oow_ack_time; /* last SYNACK */
 	u32				rcv_nxt; /* the ack # by SYNACK. For
 						  * FastOpen it's the seq#
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45b714a..0caee631a836 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -6,10 +6,10 @@
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport);
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff);
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3e097e39d4d2..207147b4c6b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1827,7 +1827,7 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req,
 				       bool *strict);
-	__u32 (*init_seq)(const struct sk_buff *skb);
+	__u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce461fbe6..a8d6062cbb4a 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -40,8 +40,8 @@ static u32 seq_scale(u32 seq)
 #endif
 
 #if IS_ENABLED(CONFIG_IPV6)
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport)
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff)
 {
 	u32 secret[MD5_MESSAGE_BYTES / 4];
 	u32 hash[MD5_DIGEST_WORDS];
@@ -58,6 +58,7 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 
 	md5_transform(hash, secret);
 
+	*tsoff = hash[1];
 	return seq_scale(hash[0]);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
@@ -86,8 +87,8 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 
 #ifdef CONFIG_INET
 
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport)
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff)
 {
 	u32 hash[MD5_DIGEST_WORDS];
 
@@ -99,6 +100,7 @@ __u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 
 	md5_transform(hash, net_secret);
 
+	*tsoff = hash[1];
 	return seq_scale(hash[0]);
 }
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0dc6286272aa..3e88467d70ee 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -334,6 +334,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq = tcp_rsk(req);
 	treq->rcv_isn		= ntohl(th->seq) - 1;
 	treq->snt_isn		= cookie;
+	treq->ts_off		= 0;
 	req->mss		= mss;
 	ireq->ir_num		= ntohs(th->dest);
 	ireq->ir_rmt_port	= th->source;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 56fe736fd64d..2257de244622 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6307,6 +6307,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		goto drop;
 
 	tcp_rsk(req)->af_specific = af_ops;
+	tcp_rsk(req)->ts_off = 0;
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6328,6 +6329,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
 
+	if (isn && tmp_opt.tstamp_ok)
+		af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
+
 	if (!want_cookie && !isn) {
 		/* VJ's idea. We save last timestamp seen
 		 * from the destination in peer table, when entering
@@ -6368,7 +6372,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 			goto drop_and_release;
 		}
 
-		isn = af_ops->init_seq(skb);
+		isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off);
 	}
 	if (!dst) {
 		dst = af_ops->route_req(sk, &fl, req, NULL);
@@ -6380,6 +6384,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
 	if (want_cookie) {
 		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		tcp_rsk(req)->ts_off = 0;
 		req->cookie_ts = tmp_opt.tstamp_ok;
 		if (!tmp_opt.tstamp_ok)
 			inet_rsk(req)->ecn_ok = 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5555eb86e549..b50f05905ced 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -95,12 +95,12 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
-static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff)
 {
 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 					  ip_hdr(skb)->saddr,
 					  tcp_hdr(skb)->dest,
-					  tcp_hdr(skb)->source);
+					  tcp_hdr(skb)->source, tsoff);
 }
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -237,7 +237,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 							   inet->inet_daddr,
 							   inet->inet_sport,
-							   usin->sin_port);
+							   usin->sin_port,
+							   &tp->tsoffset);
 
 	inet->inet_id = tp->write_seq ^ jiffies;
 
@@ -824,7 +825,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp,
+			tcp_time_stamp + tcp_rsk(req)->ts_off,
 			req->ts_recent,
 			0,
 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6234ebaa7db1..28ce5ee831f5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -532,7 +532,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 			newtp->rx_opt.ts_recent_stamp = 0;
 			newtp->tcp_header_len = sizeof(struct tcphdr);
 		}
-		newtp->tsoffset = 0;
+		newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
 		newtp->md5sig_info = NULL;	/*XXX*/
 		if (newtp->af_specific->md5_lookup(sk, newsk))
@@ -581,6 +581,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 		if (tmp_opt.saw_tstamp) {
 			tmp_opt.ts_recent = req->ts_recent;
+			if (tmp_opt.rcv_tsecr)
+				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
 			/* We do not store true stamp, but it is not required,
 			 * it can be estimated (approximately)
 			 * from another data.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d3545d0cff75..c7adcb57654e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -640,7 +640,7 @@ static unsigned int tcp_synack_options(struct request_sock *req,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb);
+		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
 		opts->tsecr = req->ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 97830a6a9cbb..a4d49760bf43 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -209,6 +209,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack.v64	= 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
+	treq->ts_off = 0;
 
 	/*
 	 * We need to lookup the dst_entry to get the correct window size.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 28ec0a2e7b72..a2185a214abc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -101,12 +101,12 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
+static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff)
 {
 	return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
 					    ipv6_hdr(skb)->saddr.s6_addr32,
 					    tcp_hdr(skb)->dest,
-					    tcp_hdr(skb)->source);
+					    tcp_hdr(skb)->source, tsoff);
 }
 
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
@@ -283,7 +283,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
 							     inet->inet_sport,
-							     inet->inet_dport);
+							     inet->inet_dport,
+							     &tp->tsoffset);
 
 	err = tcp_connect(sk);
 	if (err)
@@ -956,7 +957,8 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
+			tcp_time_stamp + tcp_rsk(req)->ts_off,
+			req->ts_recent, sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
 			0, 0);
 }
-- 
cgit v1.2.3


From 7091d8c7055d7310339435ae3af2fb490a92524d Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:37 +0200
Subject: net/sched: cls_flower: Add offload support using egress Hardware
 device

In order to support hardware offloading when the device given by the tc
rule is different from the Hardware underlying device, extract the mirred
(egress) device from the tc action when a filter is added, using the new
tc_action_ops, get_dev().

Flower caches the information about the mirred device and uses it for
calling ndo_setup_tc in filter change, update stats and delete.

Calling ndo_setup_tc of the mirred (egress) device instead of the
ingress device will allow a resolution between the software ingress
device and the underlying hardware device.

The resolution will take place inside the offloading driver using
'egress_device' flag added to tc_to_netdev struct which is provided to
the offloading driver.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 include/net/pkt_cls.h     |  2 ++
 net/sched/cls_api.c       | 24 ++++++++++++++++++++++++
 net/sched/cls_flower.c    | 41 ++++++++++++++++++++++++-----------------
 4 files changed, 51 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3755317cc6a9..1ff5ea6e1221 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,6 +802,7 @@ struct tc_to_netdev {
 		struct tc_cls_matchall_offload *cls_mall;
 		struct tc_cls_bpf_offload *cls_bpf;
 	};
+	bool egress_dev;
 };
 
 /* These structures hold the attributes of xdp state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 45ad9aab9bba..f0a051480c6c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 		     struct tcf_exts *src);
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
 int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+		     struct net_device **hw_dev);
 
 /**
  * struct tcf_pkt_info - packet information
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b05d4a2155b0..3fbba79a4ef0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -682,6 +682,30 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
 }
 EXPORT_SYMBOL(tcf_exts_dump_stats);
 
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+		     struct net_device **hw_dev)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	const struct tc_action *a;
+	LIST_HEAD(actions);
+
+	if (tc_no_actions(exts))
+		return -EINVAL;
+
+	tcf_exts_to_list(exts, &actions);
+	list_for_each_entry(a, &actions, list) {
+		if (a->ops->get_dev) {
+			a->ops->get_dev(a, dev_net(dev), hw_dev);
+			break;
+		}
+	}
+	if (*hw_dev)
+		return 0;
+#endif
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(tcf_exts_get_dev);
+
 static int __init tc_filter_init(void)
 {
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 13b349f426a7..1cacfa5c95f3 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -78,6 +78,8 @@ struct cls_fl_filter {
 	u32 handle;
 	u32 flags;
 	struct rcu_head	rcu;
+	struct tc_to_netdev tc;
+	struct net_device *hw_dev;
 };
 
 static unsigned short int fl_mask_range(const struct fl_flow_mask *mask)
@@ -203,9 +205,9 @@ static void fl_destroy_filter(struct rcu_head *head)
 
 static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
-	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
-	struct tc_to_netdev tc;
+	struct net_device *dev = f->hw_dev;
+	struct tc_to_netdev *tc = &f->tc;
 
 	if (!tc_can_offload(dev, tp))
 		return;
@@ -213,10 +215,10 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 	offload.command = TC_CLSFLOWER_DESTROY;
 	offload.cookie = (unsigned long)f;
 
-	tc.type = TC_SETUP_CLSFLOWER;
-	tc.cls_flower = &offload;
+	tc->type = TC_SETUP_CLSFLOWER;
+	tc->cls_flower = &offload;
 
-	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
 }
 
 static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -226,11 +228,17 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
-	struct tc_to_netdev tc;
+	struct tc_to_netdev *tc = &f->tc;
 	int err;
 
-	if (!tc_can_offload(dev, tp))
-		return tc_skip_sw(f->flags) ? -EINVAL : 0;
+	if (!tc_can_offload(dev, tp)) {
+		if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev))
+			return tc_skip_sw(f->flags) ? -EINVAL : 0;
+		dev = f->hw_dev;
+		tc->egress_dev = true;
+	} else {
+		f->hw_dev = dev;
+	}
 
 	offload.command = TC_CLSFLOWER_REPLACE;
 	offload.cookie = (unsigned long)f;
@@ -239,23 +247,22 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 	offload.key = &f->key;
 	offload.exts = &f->exts;
 
-	tc.type = TC_SETUP_CLSFLOWER;
-	tc.cls_flower = &offload;
+	tc->type = TC_SETUP_CLSFLOWER;
+	tc->cls_flower = &offload;
 
 	err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol,
-					    &tc);
+					    tc);
 
 	if (tc_skip_sw(f->flags))
 		return err;
-
 	return 0;
 }
 
 static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
-	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_flower_offload offload = {0};
-	struct tc_to_netdev tc;
+	struct net_device *dev = f->hw_dev;
+	struct tc_to_netdev *tc = &f->tc;
 
 	if (!tc_can_offload(dev, tp))
 		return;
@@ -264,10 +271,10 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
 	offload.cookie = (unsigned long)f;
 	offload.exts = &f->exts;
 
-	tc.type = TC_SETUP_CLSFLOWER;
-	tc.cls_flower = &offload;
+	tc->type = TC_SETUP_CLSFLOWER;
+	tc->cls_flower = &offload;
 
-	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, tc);
 }
 
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
-- 
cgit v1.2.3


From b2cd12574aa3e1625f471ff57cde7f628a18a46b Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:03 -0800
Subject: bpf: Refactor cgroups code in prep for new type

Code move and rename only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++-----------------------
 kernel/bpf/cgroup.c        | 10 +++++-----
 kernel/bpf/syscall.c       | 28 +++++++++++++++-------------
 3 files changed, 43 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 0cf1adfadd2d..af2ca8b432c0 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -36,31 +36,31 @@ void cgroup_bpf_update(struct cgroup *cgrp,
 		       struct bpf_prog *prog,
 		       enum bpf_attach_type type);
 
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type);
-
-/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
-#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled)						\
-		__ret = __cgroup_bpf_run_filter(sk, skb,		\
-						BPF_CGROUP_INET_INGRESS); \
-									\
-	__ret;								\
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
+						    BPF_CGROUP_INET_INGRESS); \
+									      \
+	__ret;								      \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
-		typeof(sk) __sk = sk_to_full_sk(sk);			\
-		if (sk_fullsock(__sk))					\
-			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
-						BPF_CGROUP_INET_EGRESS); \
-	}								\
-	__ret;								\
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		       \
+		typeof(sk) __sk = sk_to_full_sk(sk);			       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
+						      BPF_CGROUP_INET_EGRESS); \
+	}								       \
+	__ret;								       \
 })
 
 #else
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8c784f8c67cd..8fe55ffd109d 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
 }
 
 /**
- * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socken sending or receiving traffic
  * @skb: The skb that is being sent or received
  * @type: The type of program to be exectuted
@@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
  * This function will return %-EPERM if any if an attached program was found
  * and if it returned != 1 during execution. In all other cases, 0 is returned.
  */
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type)
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
@@ -164,4 +164,4 @@ int __cgroup_bpf_run_filter(struct sock *sk,
 
 	return ret;
 }
-EXPORT_SYMBOL(__cgroup_bpf_run_filter);
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4caa18e6860a..5518a6839ab1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	struct bpf_prog *prog;
 	struct cgroup *cgrp;
+	enum bpf_prog_type ptype;
 
 	if (!capable(CAP_NET_ADMIN))
 		return -EPERM;
@@ -866,25 +867,26 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
-		prog = bpf_prog_get_type(attr->attach_bpf_fd,
-					 BPF_PROG_TYPE_CGROUP_SKB);
-		if (IS_ERR(prog))
-			return PTR_ERR(prog);
-
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp)) {
-			bpf_prog_put(prog);
-			return PTR_ERR(cgrp);
-		}
-
-		cgroup_bpf_update(cgrp, prog, attr->attach_type);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
 
 	default:
 		return -EINVAL;
 	}
 
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp)) {
+		bpf_prog_put(prog);
+		return PTR_ERR(cgrp);
+	}
+
+	cgroup_bpf_update(cgrp, prog, attr->attach_type);
+	cgroup_put(cgrp);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 61023658760032e97869b07d54be9681d2529e77 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:04 -0800
Subject: bpf: Add new cgroup attach type to enable sock modifications

Add a new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and are run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 14 +++++++++++
 include/uapi/linux/bpf.h   |  6 +++++
 kernel/bpf/cgroup.c        | 33 ++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  5 +++-
 net/core/filter.c          | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c         | 12 ++++++++-
 net/ipv6/af_inet6.c        |  8 ++++++
 7 files changed, 138 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index af2ca8b432c0..7b6e5d168c95 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
+						 BPF_CGROUP_INET_SOCK_CREATE); \
+	}								       \
+	__ret;								       \
+})
+
 #else
 
 struct cgroup_bpf {};
@@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 22ac82792687..bfe5e31a1288 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 	BPF_PROG_TYPE_LWT_IN,
 	BPF_PROG_TYPE_LWT_OUT,
 	BPF_PROG_TYPE_LWT_XMIT,
@@ -109,6 +110,7 @@ enum bpf_prog_type {
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -567,6 +569,10 @@ enum bpf_ret_code {
 	/* >127 are reserved for prog type specific return codes */
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 8fe55ffd109d..a515f7b007c6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5518a6839ab1..85af86c496cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_EGRESS:
 		ptype = BPF_PROG_TYPE_CGROUP_SKB;
 		break;
-
+	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 1c4d0faf22c8..0ab252e462aa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2818,6 +2818,32 @@ static bool lwt_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock, bound_dev_if):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	if (off < 0 || off + size > sizeof(struct bpf_sock))
+		return false;
+
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 			       const struct bpf_prog *prog)
 {
@@ -3076,6 +3102,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
+					  int dst_reg, int src_reg,
+					  int ctx_off,
+					  struct bpf_insn *insn_buf,
+					  struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
+					offsetof(struct sock, sk_bound_dev_if));
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sock, sk_bound_dev_if));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 					 int src_reg, int ctx_off,
 					 struct bpf_insn *insn_buf,
@@ -3162,6 +3212,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
+static const struct bpf_verifier_ops cg_sock_ops = {
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sock_filter_is_valid_access,
+	.convert_ctx_access	= sock_filter_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3202,6 +3258,11 @@ static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_LWT_XMIT,
 };
 
+static struct bpf_prog_type_list cg_sock_type __read_mostly = {
+	.ops	= &cg_sock_ops,
+	.type	= BPF_PROG_TYPE_CGROUP_SOCK
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3209,6 +3270,7 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&cg_sock_type);
 	bpf_register_prog_type(&lwt_in_type);
 	bpf_register_prog_type(&lwt_out_type);
 	bpf_register_prog_type(&lwt_xmit_type);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ddf5cda07f4..24d2550492ee 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -374,8 +374,18 @@ lookup_protocol:
 
 	if (sk->sk_prot->init) {
 		err = sk->sk_prot->init(sk);
-		if (err)
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
 			sk_common_release(sk);
+			goto out;
+		}
 	}
 out:
 	return err;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d424f3a3737a..237e654ba717 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -258,6 +258,14 @@ lookup_protocol:
 			goto out;
 		}
 	}
+
+	if (!kern) {
+		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
+		if (err) {
+			sk_common_release(sk);
+			goto out;
+		}
+	}
 out:
 	return err;
 out_rcu_unlock:
-- 
cgit v1.2.3


From adc176c5472214971d77c1a61c83db9b01e9cdc7 Mon Sep 17 00:00:00 2001
From: Erik Nordmark <nordmark@arista.com>
Date: Fri, 2 Dec 2016 14:00:08 -0800
Subject: ipv6 addrconf: Implemented enhanced DAD (RFC7527)

Implemented RFC7527 Enhanced DAD.
IPv6 duplicate address detection can fail if there is some temporary
loopback of Ethernet frames. RFC7527 solves this by including a random
nonce in the NS messages used for DAD, and if an NS is received with the
same nonce it is assumed to be a looped back DAD probe and is ignored.
RFC7527 is enabled by default. Can be disabled by setting both of
conf/{all,interface}/enhanced_dad to zero.

Signed-off-by: Erik Nordmark <nordmark@arista.com>
Signed-off-by: Bob Gilligan <gilligan@arista.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  9 +++++++++
 include/linux/ipv6.h                   |  1 +
 include/net/if_inet6.h                 |  1 +
 include/net/ndisc.h                    |  5 ++++-
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 22 +++++++++++++++++++++-
 net/ipv6/ndisc.c                       | 29 ++++++++++++++++++++++++++---
 net/ipv6/route.c                       |  2 +-
 8 files changed, 64 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5ca567fa6b8c..7dd65c9cf707 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1734,6 +1734,15 @@ drop_unsolicited_na - BOOLEAN
 
 	By default this is turned off.
 
+enhanced_dad - BOOLEAN
+	Include a nonce option in the IPv6 neighbor solicitation messages used for
+	duplicate address detection per RFC7527. A received DAD NS will only signal
+	a duplicate address if the nonce is different. This avoids any false
+	detection of duplicates due to loopback of the NS messages that we send.
+	The nonce option will be sent on an interface unless both of
+	conf/{all,interface}/enhanced_dad are set to FALSE.
+	Default: TRUE
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 3f95233b2733..671d014e6429 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -68,6 +68,7 @@ struct ipv6_devconf {
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	__s32		seg6_require_hmac;
 #endif
+	__u32		enhanced_dad;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index b0576cb2ab25..0fa4c324b713 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -55,6 +55,7 @@ struct inet6_ifaddr {
 	__u8			stable_privacy_retry;
 
 	__u16			scope;
+	__u64			dad_nonce;
 
 	unsigned long		cstamp;	/* created timestamp */
 	unsigned long		tstamp; /* updated timestamp */
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index be1fe2283254..d562a2fe4860 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -31,6 +31,7 @@ enum {
 	ND_OPT_PREFIX_INFO = 3,		/* RFC2461 */
 	ND_OPT_REDIRECT_HDR = 4,	/* RFC2461 */
 	ND_OPT_MTU = 5,			/* RFC2461 */
+	ND_OPT_NONCE = 14,              /* RFC7527 */
 	__ND_OPT_ARRAY_MAX,
 	ND_OPT_ROUTE_INFO = 24,		/* RFC4191 */
 	ND_OPT_RDNSS = 25,		/* RFC5006 */
@@ -121,6 +122,7 @@ struct ndisc_options {
 #define nd_opts_pi_end			nd_opt_array[__ND_OPT_PREFIX_INFO_END]
 #define nd_opts_rh			nd_opt_array[ND_OPT_REDIRECT_HDR]
 #define nd_opts_mtu			nd_opt_array[ND_OPT_MTU]
+#define nd_opts_nonce			nd_opt_array[ND_OPT_NONCE]
 #define nd_802154_opts_src_lladdr	nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
 #define nd_802154_opts_tgt_lladdr	nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
 
@@ -398,7 +400,8 @@ void ndisc_cleanup(void);
 int ndisc_rcv(struct sk_buff *skb);
 
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr);
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   u64 nonce);
 
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 53561be1ac21..eaf65dc82e22 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -181,6 +181,7 @@ enum {
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
 	DEVCONF_SEG6_ENABLED,
 	DEVCONF_SEG6_REQUIRE_HMAC,
+	DEVCONF_ENHANCED_DAD,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4c387dc338e3..c1e124bc8e1e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -242,6 +242,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	.seg6_require_hmac	= 0,
 #endif
+	.enhanced_dad           = 1,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -292,6 +293,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	.seg6_require_hmac	= 0,
 #endif
+	.enhanced_dad           = 1,
 };
 
 /* Check if a valid qdisc is available */
@@ -3735,12 +3737,21 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
 {
 	unsigned long rand_num;
 	struct inet6_dev *idev = ifp->idev;
+	u64 nonce;
 
 	if (ifp->flags & IFA_F_OPTIMISTIC)
 		rand_num = 0;
 	else
 		rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
 
+	nonce = 0;
+	if (idev->cnf.enhanced_dad ||
+	    dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
+		do
+			get_random_bytes(&nonce, 6);
+		while (nonce == 0);
+	}
+	ifp->dad_nonce = nonce;
 	ifp->dad_probes = idev->cnf.dad_transmits;
 	addrconf_mod_dad_work(ifp, rand_num);
 }
@@ -3918,7 +3929,8 @@ static void addrconf_dad_work(struct work_struct *w)
 
 	/* send a neighbour solicitation for our addr */
 	addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
-	ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
+	ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
+		      ifp->dad_nonce);
 out:
 	in6_ifa_put(ifp);
 	rtnl_unlock();
@@ -4962,6 +4974,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
 #endif
+	array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6069,6 +6082,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+	{
+		.procname       = "enhanced_dad",
+		.data           = &ipv6_devconf.enhanced_dad,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
 	{
 		/* sentinel */
 	}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d8e671457d10..7ebac630d3c6 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -233,6 +233,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
 		case ND_OPT_SOURCE_LL_ADDR:
 		case ND_OPT_TARGET_LL_ADDR:
 		case ND_OPT_MTU:
+		case ND_OPT_NONCE:
 		case ND_OPT_REDIRECT_HDR:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
 				ND_PRINTK(2, warn,
@@ -568,7 +569,8 @@ static void ndisc_send_unsol_na(struct net_device *dev)
 }
 
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr)
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   u64 nonce)
 {
 	struct sk_buff *skb;
 	struct in6_addr addr_buf;
@@ -588,6 +590,8 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 	if (inc_opt)
 		optlen += ndisc_opt_addr_space(dev,
 					       NDISC_NEIGHBOUR_SOLICITATION);
+	if (nonce != 0)
+		optlen += 8;
 
 	skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
 	if (!skb)
@@ -605,6 +609,13 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
 		ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
 				       dev->dev_addr,
 				       NDISC_NEIGHBOUR_SOLICITATION);
+	if (nonce != 0) {
+		u8 *opt = skb_put(skb, 8);
+
+		opt[0] = ND_OPT_NONCE;
+		opt[1] = 8 >> 3;
+		memcpy(opt + 2, &nonce, 6);
+	}
 
 	ndisc_send_skb(skb, daddr, saddr);
 }
@@ -693,12 +704,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
 				  __func__, target);
 		}
-		ndisc_send_ns(dev, target, target, saddr);
+		ndisc_send_ns(dev, target, target, saddr, 0);
 	} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
 		neigh_app_ns(neigh);
 	} else {
 		addrconf_addr_solict_mult(target, &mcaddr);
-		ndisc_send_ns(dev, target, &mcaddr, saddr);
+		ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
 	}
 }
 
@@ -742,6 +753,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 	int dad = ipv6_addr_any(saddr);
 	bool inc;
 	int is_router = -1;
+	u64 nonce = 0;
 
 	if (skb->len < sizeof(struct nd_msg)) {
 		ND_PRINTK(2, warn, "NS: packet too short\n");
@@ -786,6 +798,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 			return;
 		}
 	}
+	if (ndopts.nd_opts_nonce)
+		memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
 
 	inc = ipv6_addr_is_multicast(daddr);
 
@@ -794,6 +808,15 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 have_ifp:
 		if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
 			if (dad) {
+				if (nonce != 0 && ifp->dad_nonce == nonce) {
+					u8 *np = (u8 *)&nonce;
+					/* Matching nonce if looped back */
+					ND_PRINTK(2, notice,
+						  "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+						  ifp->idev->dev->name,
+						  &ifp->addr, np);
+					goto out;
+				}
 				/*
 				 * We are colliding with another node
 				 * who is doing DAD
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b317bb135ed4..aac7818e2e0f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -527,7 +527,7 @@ static void rt6_probe_deferred(struct work_struct *w)
 		container_of(w, struct __rt6_probe_work, work);
 
 	addrconf_addr_solict_mult(&work->target, &mcaddr);
-	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
+	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
 	dev_put(work->dev);
 	kfree(work);
 }
-- 
cgit v1.2.3


From c51d39010a1bccc9c1294e2d7c00005aefeb2b5c Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:25 +0100
Subject: netfilter: conntrack: built-in support for DCCP

CONFIG_NF_CT_PROTO_DCCP is no longer a tristate. When set to y, connection
tracking support for the DCCP protocol is built into nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_dccp,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)||  dccp  |  ipv4  |  ipv6  | nf_conntrack
---------++--------+--------+--------+--------------
none     || 469140 | 828755 | 828676 | 6141434
DCCP     ||   -    | 830566 | 829935 | 6533526

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_dccp.h    |  2 +-
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +
 include/net/netns/conntrack.h                  | 14 +++++
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |  3 +
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  3 +
 net/netfilter/Kconfig                          |  6 +-
 net/netfilter/Makefile                         |  3 +-
 net/netfilter/nf_conntrack_proto_dccp.c        | 79 ++++----------------------
 9 files changed, 41 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/nf_conntrack_dccp.h b/include/linux/netfilter/nf_conntrack_dccp.h
index 40dcc82058d1..ff721d7325cf 100644
--- a/include/linux/netfilter/nf_conntrack_dccp.h
+++ b/include/linux/netfilter/nf_conntrack_dccp.h
@@ -25,7 +25,7 @@ enum ct_dccp_roles {
 #define CT_DCCP_ROLE_MAX	(__CT_DCCP_ROLE_MAX - 1)
 
 #ifdef __KERNEL__
-#include <net/netfilter/nf_conntrack_tuple.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
 
 struct nf_ct_dccp {
 	u_int8_t	role[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 981c327374da..c2f155fd9299 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -15,6 +15,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index a4c993685795..5ec66c0d21c4 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -6,6 +6,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 3d06d94d2e52..440b781baf0b 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -6,6 +6,9 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+#include <linux/netfilter/nf_conntrack_dccp.h>
+#endif
 #include <linux/seqlock.h>
 
 struct ctl_table_header;
@@ -48,12 +51,23 @@ struct nf_icmp_net {
 	unsigned int timeout;
 };
 
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+struct nf_dccp_net {
+	struct nf_proto_net pn;
+	int dccp_loose;
+	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
 	struct nf_udp_net	udp;
 	struct nf_icmp_net	icmp;
 	struct nf_icmp_net	icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	struct nf_dccp_net	dccp;
+#endif
 };
 
 struct ct_pcpu {
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 7130ed5dc1fa..cb3cf770b00c 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -340,6 +340,9 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = {
 	&nf_conntrack_l4proto_tcp4,
 	&nf_conntrack_l4proto_udp4,
 	&nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	&nf_conntrack_l4proto_dccp4,
+#endif
 };
 
 static int ipv4_net_init(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 500be28ff563..f52338d02951 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -340,6 +340,9 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
 	&nf_conntrack_l4proto_tcp6,
 	&nf_conntrack_l4proto_udp6,
 	&nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	&nf_conntrack_l4proto_dccp6,
+#endif
 };
 
 static int ipv6_net_init(struct net *net)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 496e1dcbd003..27a3d8c8f8ce 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -146,14 +146,14 @@ config NF_CONNTRACK_LABELS
 	  to connection tracking entries.  It selected by the connlabel match.
 
 config NF_CT_PROTO_DCCP
-	tristate 'DCCP protocol connection tracking support'
+	bool 'DCCP protocol connection tracking support'
 	depends on NETFILTER_ADVANCED
-	default IP_DCCP
+	default y
 	help
 	  With this option enabled, the layer 3 independent connection
 	  tracking code will be able to do state tracking on DCCP connections.
 
-	  If unsure, say 'N'.
+	  If unsure, say Y.
 
 config NF_CT_PROTO_GRE
 	tristate
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3b97d89df2cd..bbd0cc08eff0 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
@@ -16,8 +17,6 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
 # connection tracking
 obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
 
-# SCTP protocol connection tracking
-obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
 obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 073b047314dc..b68ce6ac13b3 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -9,7 +9,6 @@
  *
  */
 #include <linux/kernel.h>
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/spinlock.h>
@@ -384,17 +383,9 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
 	},
 };
 
-/* this module per-net specifics */
-static unsigned int dccp_net_id __read_mostly;
-struct dccp_net {
-	struct nf_proto_net pn;
-	int dccp_loose;
-	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-
-static inline struct dccp_net *dccp_pernet(struct net *net)
+static inline struct nf_dccp_net *dccp_pernet(struct net *net)
 {
-	return net_generic(net, dccp_net_id);
+	return &net->ct.nf_ct_proto.dccp;
 }
 
 static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
@@ -424,7 +415,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		     unsigned int dataoff, unsigned int *timeouts)
 {
 	struct net *net = nf_ct_net(ct);
-	struct dccp_net *dn;
+	struct nf_dccp_net *dn;
 	struct dccp_hdr _dh, *dh;
 	const char *msg;
 	u_int8_t state;
@@ -719,7 +710,7 @@ static int dccp_nlattr_size(void)
 static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				      struct net *net, void *data)
 {
-	struct dccp_net *dn = dccp_pernet(net);
+	struct nf_dccp_net *dn = dccp_pernet(net);
 	unsigned int *timeouts = data;
 	int i;
 
@@ -820,7 +811,7 @@ static struct ctl_table dccp_sysctl_table[] = {
 #endif /* CONFIG_SYSCTL */
 
 static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
-				     struct dccp_net *dn)
+				     struct nf_dccp_net *dn)
 {
 #ifdef CONFIG_SYSCTL
 	if (pn->ctl_table)
@@ -850,7 +841,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn,
 
 static int dccp_init_net(struct net *net, u_int16_t proto)
 {
-	struct dccp_net *dn = dccp_pernet(net);
+	struct nf_dccp_net *dn = dccp_pernet(net);
 	struct nf_proto_net *pn = &dn->pn;
 
 	if (!pn->users) {
@@ -868,7 +859,7 @@ static int dccp_init_net(struct net *net, u_int16_t proto)
 	return dccp_kmemdup_sysctl_table(net, pn, dn);
 }
 
-static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = {
 	.l3proto		= AF_INET,
 	.l4proto		= IPPROTO_DCCP,
 	.name			= "dccp",
@@ -898,11 +889,11 @@ static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = {
 		.nla_policy	= dccp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.net_id			= &dccp_net_id,
 	.init_net		= dccp_init_net,
 };
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
 
-static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
+struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = {
 	.l3proto		= AF_INET6,
 	.l4proto		= IPPROTO_DCCP,
 	.name			= "dccp",
@@ -932,56 +923,6 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
 		.nla_policy	= dccp_timeout_nla_policy,
 	},
 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
-	.net_id			= &dccp_net_id,
 	.init_net		= dccp_init_net,
 };
-
-static struct nf_conntrack_l4proto *dccp_proto[] = {
-	&dccp_proto4,
-	&dccp_proto6,
-};
-
-static __net_init int dccp_net_init(struct net *net)
-{
-	return nf_ct_l4proto_pernet_register(net, dccp_proto,
-					     ARRAY_SIZE(dccp_proto));
-}
-
-static __net_exit void dccp_net_exit(struct net *net)
-{
-	nf_ct_l4proto_pernet_unregister(net, dccp_proto,
-					ARRAY_SIZE(dccp_proto));
-}
-
-static struct pernet_operations dccp_net_ops = {
-	.init = dccp_net_init,
-	.exit = dccp_net_exit,
-	.id   = &dccp_net_id,
-	.size = sizeof(struct dccp_net),
-};
-
-static int __init nf_conntrack_proto_dccp_init(void)
-{
-	int ret;
-
-	ret = register_pernet_subsys(&dccp_net_ops);
-	if (ret < 0)
-		return ret;
-	ret = nf_ct_l4proto_register(dccp_proto, ARRAY_SIZE(dccp_proto));
-	if (ret < 0)
-		unregister_pernet_subsys(&dccp_net_ops);
-	return ret;
-}
-
-static void __exit nf_conntrack_proto_dccp_fini(void)
-{
-	nf_ct_l4proto_unregister(dccp_proto, ARRAY_SIZE(dccp_proto));
-	unregister_pernet_subsys(&dccp_net_ops);
-}
-
-module_init(nf_conntrack_proto_dccp_init);
-module_exit(nf_conntrack_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("DCCP connection tracking protocol helper");
-MODULE_LICENSE("GPL");
+EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6);
-- 
cgit v1.2.3


From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:50 -0800
Subject: tcp: tsq: add tsq_flags / tsq_enum

This is a cleanup, to ease code review of following patches.

The old 'enum tsq_flags' is renamed to 'enum tsq_enum', and a new
'enum tsq_flags' is added with the flag masks used in cmpxchg()
operations, as opposed to single-bit operations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   | 11 ++++++++++-
 net/ipv4/tcp_output.c | 16 ++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 734bab4c3bef..d8be083ab0b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -364,7 +364,7 @@ struct tcp_sock {
 	u32	*saved_syn;
 };
 
-enum tsq_flags {
+enum tsq_enum {
 	TSQ_THROTTLED,
 	TSQ_QUEUED,
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
@@ -375,6 +375,15 @@ enum tsq_flags {
 				    */
 };
 
+enum tsq_flags {
+	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
+	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c7adcb57654e..8f0289b0fb24 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -784,10 +784,10 @@ static void tcp_tasklet_func(unsigned long data)
 	}
 }
 
-#define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
-			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
-			  (1UL << TCP_MTU_REDUCED_DEFERRED))
+#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
+			  TCPF_WRITE_TIMER_DEFERRED |	\
+			  TCPF_DELACK_TIMER_DEFERRED |	\
+			  TCPF_MTU_REDUCED_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -808,7 +808,7 @@ void tcp_release_cb(struct sock *sk)
 		nflags = flags & ~TCP_DEFERRED_ALL;
 	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
 
-	if (flags & (1UL << TCP_TSQ_DEFERRED))
+	if (flags & TCPF_TSQ_DEFERRED)
 		tcp_tsq_handler(sk);
 
 	/* Here begins the tricky part :
@@ -822,15 +822,15 @@ void tcp_release_cb(struct sock *sk)
 	 */
 	sock_release_ownership(sk);
 
-	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
+	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
+	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
 		tcp_delack_timer_handler(sk);
 		__sock_put(sk);
 	}
-	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
+	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
-- 
cgit v1.2.3


From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:57 -0800
Subject: tcp: tsq: move tsq_flags close to sk_wmem_alloc

tsq_flags being in the same cache line as sk_wmem_alloc
makes a lot of sense. Both fields are changed from tcp_wfree()
and more generally by various TSQ related functions.

Prior patch made room in struct sock and added sk_tsq_flags,
this patch deletes tsq_flags from struct tcp_sock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  1 -
 net/ipv4/tcp.c        |  4 ++--
 net/ipv4/tcp_ipv4.c   |  2 +-
 net/ipv4/tcp_output.c | 24 +++++++++++-------------
 net/ipv4/tcp_timer.c  |  4 ++--
 net/ipv6/tcp_ipv6.c   |  2 +-
 6 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d8be083ab0b0..fc5848dad7a4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -186,7 +186,6 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	unsigned long	tsq_flags;
 
 	/* Data for direct copy to user */
 	struct {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1149b48700a1..1ef3165114ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -663,9 +663,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
 	if (tcp_should_autocork(sk, skb, size_goal)) {
 
 		/* avoid atomic op if TSQ_THROTTLED bit is already set */
-		if (!test_bit(TSQ_THROTTLED, &tp->tsq_flags)) {
+		if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
-			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
+			set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 		}
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b50f05905ced..30d81f533ada 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -443,7 +443,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			if (!sock_owned_by_user(sk)) {
 				tcp_v4_mtu_reduced(sk);
 			} else {
-				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
+				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
 					sock_hold(sk);
 			}
 			goto out;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5f04bee4c86a..b45101f3d2bd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -767,14 +767,15 @@ static void tcp_tasklet_func(unsigned long data)
 	list_for_each_safe(q, n, &list) {
 		tp = list_entry(q, struct tcp_sock, tsq_node);
 		list_del(&tp->tsq_node);
-		clear_bit(TSQ_QUEUED, &tp->tsq_flags);
 
 		sk = (struct sock *)tp;
+		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
+
 		if (!sk->sk_lock.owned &&
-		    test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags)) {
+		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
 			bh_lock_sock(sk);
 			if (!sock_owned_by_user(sk)) {
-				clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 				tcp_tsq_handler(sk);
 			}
 			bh_unlock_sock(sk);
@@ -797,16 +798,15 @@ static void tcp_tasklet_func(unsigned long data)
  */
 void tcp_release_cb(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long flags, nflags;
 
 	/* perform an atomic operation only if at least one flag is set */
 	do {
-		flags = tp->tsq_flags;
+		flags = sk->sk_tsq_flags;
 		if (!(flags & TCP_DEFERRED_ALL))
 			return;
 		nflags = flags & ~TCP_DEFERRED_ALL;
-	} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);
+	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
 	if (flags & TCPF_TSQ_DEFERRED)
 		tcp_tsq_handler(sk);
@@ -878,7 +878,7 @@ void tcp_wfree(struct sk_buff *skb)
 	if (wmem >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
 		goto out;
 
-	for (oval = READ_ONCE(tp->tsq_flags);; oval = nval) {
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
 		struct tsq_tasklet *tsq;
 		bool empty;
 
@@ -886,7 +886,7 @@ void tcp_wfree(struct sk_buff *skb)
 			goto out;
 
 		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
-		nval = cmpxchg(&tp->tsq_flags, oval, nval);
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
 		if (nval != oval)
 			continue;
 
@@ -2100,7 +2100,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 		    skb->prev == sk->sk_write_queue.next)
 			return false;
 
-		set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
 		/* It is possible TX completion already happened
 		 * before we set TSQ_THROTTLED, so we must
 		 * test again the condition.
@@ -2241,8 +2241,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
-		if (test_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags))
-			clear_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
+		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
+			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
@@ -3545,8 +3545,6 @@ void tcp_send_ack(struct sock *sk)
 	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
 	 * too much.
 	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
-	 * We also avoid tcp_wfree() overhead (cache line miss accessing
-	 * tp->tsq_flags) by using regular sock_wfree()
 	 */
 	skb_set_tcp_pure_ack(buff);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3ea1cf804748..3705075f42c3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -310,7 +310,7 @@ static void tcp_delack_timer(unsigned long data)
 		inet_csk(sk)->icsk_ack.blocked = 1;
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
 		/* deleguate our work to tcp_release_cb() */
-		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+		if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
 			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
@@ -592,7 +592,7 @@ static void tcp_write_timer(unsigned long data)
 		tcp_write_timer_handler(sk);
 	} else {
 		/* delegate our work to tcp_release_cb() */
-		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
 			sock_hold(sk);
 	}
 	bh_unlock_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a2185a214abc..73bc8fc68acd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -399,7 +399,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		if (!sock_owned_by_user(sk))
 			tcp_v6_mtu_reduced(sk);
 		else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
-					   &tp->tsq_flags))
+					   &sk->sk_tsq_flags))
 			sock_hold(sk);
 		goto out;
 	}
-- 
cgit v1.2.3


From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 4 Dec 2016 23:19:41 +0100
Subject: bpf: add prog_digest and expose it via fdinfo/netlink

When loading a BPF program via bpf(2), calculate the digest over
the program's instruction stream and store it in struct bpf_prog's
digest member. This is done at a point in time before any instructions
are rewritten by the verifier. Any unstable map file descriptor
number that is part of the imm field will be zeroed for the hash.

fdinfo example output for progs:

  # cat /proc/1590/fdinfo/5
  pos:          0
  flags:        02000002
  mnt_id:       11
  prog_type:    1
  prog_jited:   1
  prog_digest:  b27e8b06da22707513aa97363dfb11c7c3675d28
  memlock:      4096

When programs are pinned and retrieved by an ELF loader, the loader
can check the program's digest through fdinfo and compare it against
one that was generated over the ELF file's program section to see
if the program needs to be reloaded. Furthermore, this can also be
exposed through other means such as netlink in case of a tc cls/act
dump (or xdp in future), but also through tracepoints or other
facilities to identify the program. Other than that, the digest can
also serve as a base name for the work-in-progress kallsyms support
of programs. The digest doesn't depend on or select the crypto layer, since
we need to keep dependencies to a minimum. iproute2 will get support
for this facility.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h                |  1 +
 include/linux/filter.h             |  7 +++-
 include/uapi/linux/pkt_cls.h       |  1 +
 include/uapi/linux/tc_act/tc_bpf.h |  1 +
 kernel/bpf/core.c                  | 65 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c               | 24 +++++++++++++-
 kernel/bpf/verifier.c              |  2 ++
 net/sched/act_bpf.c                |  9 ++++++
 net/sched/cls_bpf.c                |  8 +++++
 9 files changed, 116 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 69d0a7f12a3b..8796ff03f472 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+void bpf_prog_calc_digest(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 97338134398f..f078d2b1cff6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/capability.h>
+#include <linux/cryptohash.h>
 
 #include <net/sch_generic.h>
 
@@ -56,6 +57,9 @@ struct bpf_prog_aux;
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
+/* Maximum BPF program size in bytes. */
+#define MAX_BPF_SIZE	(BPF_MAXINSNS * sizeof(struct bpf_insn))
+
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -404,8 +408,9 @@ struct bpf_prog {
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
-	u32			len;		/* Number of filter blocks */
 	enum bpf_prog_type	type;		/* Type of BPF program */
+	u32			len;		/* Number of filter blocks */
+	u32			digest[SHA_DIGEST_WORDS]; /* Program digest */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	unsigned int		(*bpf_func)(const void *ctx,
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 86786d45ee66..1adc0b654996 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -397,6 +397,7 @@ enum {
 	TCA_BPF_NAME,
 	TCA_BPF_FLAGS,
 	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_DIGEST,
 	__TCA_BPF_MAX,
 };
 
diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h
index 063d9d465119..a6b88a6f7f71 100644
--- a/include/uapi/linux/tc_act/tc_bpf.h
+++ b/include/uapi/linux/tc_act/tc_bpf.h
@@ -27,6 +27,7 @@ enum {
 	TCA_ACT_BPF_FD,
 	TCA_ACT_BPF_NAME,
 	TCA_ACT_BPF_PAD,
+	TCA_ACT_BPF_DIGEST,
 	__TCA_ACT_BPF_MAX,
 };
 #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 82a04143368e..bdcc9f4ba767 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+#define SHA_BPF_RAW_SIZE						\
+	round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES)
+
+/* Called under verifier mutex. */
+void bpf_prog_calc_digest(struct bpf_prog *fp)
+{
+	const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
+	static u32 ws[SHA_WORKSPACE_WORDS];
+	static u8 raw[SHA_BPF_RAW_SIZE];
+	struct bpf_insn *dst = (void *)raw;
+	u32 i, bsize, psize, blocks;
+	bool was_ld_map;
+	u8 *todo = raw;
+	__be32 *result;
+	__be64 *bits;
+
+	sha_init(fp->digest);
+	memset(ws, 0, sizeof(ws));
+
+	/* We need to take out the map fd for the digest calculation
+	 * since they are unstable from user space side.
+	 */
+	for (i = 0, was_ld_map = false; i < fp->len; i++) {
+		dst[i] = fp->insnsi[i];
+		if (!was_ld_map &&
+		    dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+			was_ld_map = true;
+			dst[i].imm = 0;
+		} else if (was_ld_map &&
+			   dst[i].code == 0 &&
+			   dst[i].dst_reg == 0 &&
+			   dst[i].src_reg == 0 &&
+			   dst[i].off == 0) {
+			was_ld_map = false;
+			dst[i].imm = 0;
+		} else {
+			was_ld_map = false;
+		}
+	}
+
+	psize = fp->len * sizeof(struct bpf_insn);
+	memset(&raw[psize], 0, sizeof(raw) - psize);
+	raw[psize++] = 0x80;
+
+	bsize  = round_up(psize, SHA_MESSAGE_BYTES);
+	blocks = bsize / SHA_MESSAGE_BYTES;
+	if (bsize - psize >= sizeof(__be64)) {
+		bits = (__be64 *)(todo + bsize - sizeof(__be64));
+	} else {
+		bits = (__be64 *)(todo + bsize + bits_offset);
+		blocks++;
+	}
+	*bits = cpu_to_be64((psize - 1) << 3);
+
+	while (blocks--) {
+		sha_transform(fp->digest, todo, ws);
+		todo += SHA_MESSAGE_BYTES;
+	}
+
+	result = (__force __be32 *)fp->digest;
+	for (i = 0; i < SHA_DIGEST_WORDS; i++)
+		result[i] = cpu_to_be32(fp->digest[i]);
+}
+
 static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
 {
 	return BPF_CLASS(insn->code) == BPF_JMP  &&
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 85af86c496cd..c0d2b423ce93 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_FS
+static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	const struct bpf_prog *prog = filp->private_data;
+	char prog_digest[sizeof(prog->digest) * 2 + 1] = { };
+
+	bin2hex(prog_digest, prog->digest, sizeof(prog->digest));
+	seq_printf(m,
+		   "prog_type:\t%u\n"
+		   "prog_jited:\t%u\n"
+		   "prog_digest:\t%s\n"
+		   "memlock:\t%llu\n",
+		   prog->type,
+		   prog->jited,
+		   prog_digest,
+		   prog->pages * 1ULL << PAGE_SHIFT);
+}
+#endif
+
 static const struct file_operations bpf_prog_fops = {
-        .release = bpf_prog_release,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= bpf_prog_show_fdinfo,
+#endif
+	.release	= bpf_prog_release,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 38d05da84a49..cb37339ca0da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3176,6 +3176,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log_level = 0;
 	}
 
+	bpf_prog_calc_digest(env->prog);
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 84c1d2da4f8b..1c60317f0121 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog,
 static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog,
 				  struct sk_buff *skb)
 {
+	struct nlattr *nla;
+
 	if (prog->bpf_name &&
 	    nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name))
 		return -EMSGSIZE;
 
+	nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST,
+			  sizeof(prog->filter->digest));
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
 	return 0;
 }
 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index f70e03d2d2c8..adc776048d1a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog,
 static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog,
 				  struct sk_buff *skb)
 {
+	struct nlattr *nla;
+
 	if (prog->bpf_name &&
 	    nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name))
 		return -EMSGSIZE;
 
+	nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest));
+	if (nla == NULL)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), prog->filter->digest, nla_len(nla));
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0aa8c57a04907a5d02068ff9f917629be97ea78d Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Tue, 15 Nov 2016 17:48:44 -0500
Subject: netfilter: introduce accessor functions for hook entries

This allows easier future refactoring.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h       | 27 +++++++++++++++++++++++++++
 net/bridge/br_netfilter_hooks.c |  2 +-
 net/netfilter/core.c            | 10 ++++------
 net/netfilter/nf_queue.c        |  5 ++---
 4 files changed, 34 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 69230140215b..575aa198097e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -79,6 +79,33 @@ struct nf_hook_entry {
 	const struct nf_hook_ops	*orig_ops;
 };
 
+static inline void
+nf_hook_entry_init(struct nf_hook_entry *entry,	const struct nf_hook_ops *ops)
+{
+	entry->next = NULL;
+	entry->ops = *ops;
+	entry->orig_ops = ops;
+}
+
+static inline int
+nf_hook_entry_priority(const struct nf_hook_entry *entry)
+{
+	return entry->ops.priority;
+}
+
+static inline int
+nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
+		     struct nf_hook_state *state)
+{
+	return entry->ops.hook(entry->ops.priv, skb, state);
+}
+
+static inline const struct nf_hook_ops *
+nf_hook_entry_ops(const struct nf_hook_entry *entry)
+{
+	return entry->orig_ops;
+}
+
 static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      unsigned int hook,
 				      u_int8_t pf,
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 83d937f4415e..adad2eed29e6 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1010,7 +1010,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
 
 	elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
 
-	while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF))
+	while (elem && (nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF))
 		elem = rcu_dereference(elem->next);
 
 	if (!elem)
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index de30e08d58f2..2bb46e2d8d30 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -102,15 +102,13 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg)
 	if (!entry)
 		return -ENOMEM;
 
-	entry->orig_ops	= reg;
-	entry->ops	= *reg;
-	entry->next	= NULL;
+	nf_hook_entry_init(entry, reg);
 
 	mutex_lock(&nf_hook_mutex);
 
 	/* Find the spot in the list */
 	while ((p = nf_entry_dereference(*pp)) != NULL) {
-		if (reg->priority < p->orig_ops->priority)
+		if (reg->priority < nf_hook_entry_priority(p))
 			break;
 		pp = &p->next;
 	}
@@ -140,7 +138,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)
 
 	mutex_lock(&nf_hook_mutex);
 	while ((p = nf_entry_dereference(*pp)) != NULL) {
-		if (p->orig_ops == reg) {
+		if (nf_hook_entry_ops(p) == reg) {
 			rcu_assign_pointer(*pp, p->next);
 			break;
 		}
@@ -311,7 +309,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 	int ret;
 
 	do {
-		verdict = entry->ops.hook(entry->ops.priv, skb, state);
+		verdict = nf_hook_entry_hookfn(entry, skb, state);
 		switch (verdict & NF_VERDICT_MASK) {
 		case NF_ACCEPT:
 			entry = rcu_dereference(entry->next);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 77cba9f6ccb6..4a7662486f44 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -185,7 +185,7 @@ static unsigned int nf_iterate(struct sk_buff *skb,
 
 	do {
 repeat:
-		verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state);
+		verdict = nf_hook_entry_hookfn((*entryp), skb, state);
 		if (verdict != NF_ACCEPT) {
 			if (verdict != NF_REPEAT)
 				return verdict;
@@ -200,7 +200,6 @@ repeat:
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 {
 	struct nf_hook_entry *hook_entry = entry->hook;
-	struct nf_hook_ops *elem = &hook_entry->ops;
 	struct sk_buff *skb = entry->skb;
 	const struct nf_afinfo *afinfo;
 	int err;
@@ -209,7 +208,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
 
 	/* Continue traversal iff userspace said ok... */
 	if (verdict == NF_REPEAT)
-		verdict = elem->hook(elem->priv, skb, &entry->state);
+		verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);
 
 	if (verdict == NF_ACCEPT) {
 		afinfo = nf_get_afinfo(entry->state.pf);
-- 
cgit v1.2.3


From d415b9eb76fc55c03ef5451691170aa5771dcea3 Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@redhat.com>
Date: Tue, 15 Nov 2016 17:48:45 -0500
Subject: netfilter: decouple nf_hook_entry and nf_hook_ops

During nfhook traversal we only need a very small subset of
nf_hook_ops members.

We need:
- next element
- hook function to call
- hook function priv argument

Bridge netfilter also needs 'thresh'; can be obtained via ->orig_ops.

nf_hook_entry struct is now 32 bytes on x86_64.

A followup patch will turn the run-time list into an array that only
stores hook functions plus their priv arguments, eliminating the ->next
element.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 575aa198097e..a4b97be30b28 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -75,7 +75,8 @@ struct nf_hook_ops {
 
 struct nf_hook_entry {
 	struct nf_hook_entry __rcu	*next;
-	struct nf_hook_ops		ops;
+	nf_hookfn			*hook;
+	void				*priv;
 	const struct nf_hook_ops	*orig_ops;
 };
 
@@ -83,21 +84,22 @@ static inline void
 nf_hook_entry_init(struct nf_hook_entry *entry,	const struct nf_hook_ops *ops)
 {
 	entry->next = NULL;
-	entry->ops = *ops;
+	entry->hook = ops->hook;
+	entry->priv = ops->priv;
 	entry->orig_ops = ops;
 }
 
 static inline int
 nf_hook_entry_priority(const struct nf_hook_entry *entry)
 {
-	return entry->ops.priority;
+	return entry->orig_ops->priority;
 }
 
 static inline int
 nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
 		     struct nf_hook_state *state)
 {
-	return entry->ops.hook(entry->ops.priv, skb, state);
+	return entry->hook(entry->priv, skb, state);
 }
 
 static inline const struct nf_hook_ops *
-- 
cgit v1.2.3


From 4d31eef5176df06f218201bc9c0ce40babb41660 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:17 +0100
Subject: netfilter: x_tables: pass xt_counters struct instead of packet
 counter

On SMP we overload the packet counter (unsigned long) to contain
percpu offset.  Hide this from callers and pass xt_counters address
instead.

Preparation patch to allocate the percpu counters in page-sized batch
chunks.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 6 +-----
 net/ipv4/netfilter/arp_tables.c    | 4 ++--
 net/ipv4/netfilter/ip_tables.c     | 4 ++--
 net/ipv6/netfilter/ip6_tables.c    | 5 ++---
 net/netfilter/x_tables.c           | 9 +++++++++
 5 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index cd4eaf8df445..6e61edeb68e3 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -430,11 +430,7 @@ static inline unsigned long xt_percpu_counter_alloc(void)
 
 	return 0;
 }
-static inline void xt_percpu_counter_free(u64 pcnt)
-{
-	if (nr_cpu_ids > 1)
-		free_percpu((void __percpu *) (unsigned long) pcnt);
-}
+void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
 xt_get_this_cpu_counter(struct xt_counters *cnt)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 848a0704b28f..019f8e8dda6d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -439,7 +439,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 err:
 	module_put(t->u.kernel.target->me);
 out:
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
 	return ret;
 }
@@ -519,7 +519,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 46815c8a60d7..acc9a0c45bdf 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -582,7 +582,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 		cleanup_match(ematch, net);
 	}
 
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
 	return ret;
 }
@@ -670,7 +670,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6ff42b8301cc..88b56a98905b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -612,7 +612,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 		cleanup_match(ematch, net);
 	}
 
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
 	return ret;
 }
@@ -699,8 +699,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
-
-	xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ad818e52859b..0580029eb0ee 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1615,6 +1615,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)
 }
 EXPORT_SYMBOL_GPL(xt_proto_fini);
 
+void xt_percpu_counter_free(struct xt_counters *counters)
+{
+	unsigned long pcnt = counters->pcnt;
+
+	if (nr_cpu_ids > 1)
+		free_percpu((void __percpu *)pcnt);
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+
 static int __net_init xt_net_init(struct net *net)
 {
 	int i;
-- 
cgit v1.2.3


From f28e15bacedd444608e25421c72eb2cf4527c9ca Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:18 +0100
Subject: netfilter: x_tables: pass xt_counters struct to counter allocator

Keeps some noise away from a followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 27 +--------------------------
 net/ipv4/netfilter/arp_tables.c    |  5 +----
 net/ipv4/netfilter/ip_tables.c     |  5 +----
 net/ipv6/netfilter/ip6_tables.c    |  5 +----
 net/netfilter/x_tables.c           | 30 ++++++++++++++++++++++++++++++
 5 files changed, 34 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 6e61edeb68e3..05a94bd32c55 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -404,32 +404,7 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 }
 
 
-/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
- * real (percpu) counter.  On !SMP, its just the packet count,
- * so nothing needs to be done there.
- *
- * xt_percpu_counter_alloc returns the address of the percpu
- * counter, or 0 on !SMP. We force an alignment of 16 bytes
- * so that bytes/packets share a common cache line.
- *
- * Hence caller must use IS_ERR_VALUE to check for error, this
- * allows us to return 0 for single core systems without forcing
- * callers to deal with SMP vs. NONSMP issues.
- */
-static inline unsigned long xt_percpu_counter_alloc(void)
-{
-	if (nr_cpu_ids > 1) {
-		void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
-						    sizeof(struct xt_counters));
-
-		if (res == NULL)
-			return -ENOMEM;
-
-		return (__force unsigned long) res;
-	}
-
-	return 0;
-}
+bool xt_percpu_counter_alloc(struct xt_counters *counters);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 019f8e8dda6d..808deb275ceb 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -415,13 +415,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
-	unsigned long pcnt;
 	int ret;
 
-	pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(&e->counters))
 		return -ENOMEM;
-	e->counters.pcnt = pcnt;
 
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index acc9a0c45bdf..a48430d3420f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -539,12 +539,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
-	unsigned long pcnt;
 
-	pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(&e->counters))
 		return -ENOMEM;
-	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 88b56a98905b..a5a92083fd62 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -570,12 +570,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	unsigned int j;
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
-	unsigned long pcnt;
 
-	pcnt = xt_percpu_counter_alloc();
-	if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(&e->counters))
 		return -ENOMEM;
-	e->counters.pcnt = pcnt;
 
 	j = 0;
 	mtpar.net	= net;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0580029eb0ee..be5e83047594 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1615,6 +1615,36 @@ void xt_proto_fini(struct net *net, u_int8_t af)
 }
 EXPORT_SYMBOL_GPL(xt_proto_fini);
 
+/**
+ * xt_percpu_counter_alloc - allocate x_tables rule counter
+ *
+ * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
+ *
+ * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
+ * contain the address of the real (percpu) counter.
+ *
+ * Rule evaluation needs to use xt_get_this_cpu_counter() helper
+ * to fetch the real percpu counter.
+ *
+ * returns false on error.
+ */
+bool xt_percpu_counter_alloc(struct xt_counters *counter)
+{
+	void __percpu *res;
+
+	if (nr_cpu_ids <= 1)
+		return true;
+
+	res = __alloc_percpu(sizeof(struct xt_counters),
+			     sizeof(struct xt_counters));
+	if (!res)
+		return false;
+
+	counter->pcnt = (__force unsigned long)res;
+	return true;
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
+
 void xt_percpu_counter_free(struct xt_counters *counters)
 {
 	unsigned long pcnt = counters->pcnt;
-- 
cgit v1.2.3


From ae0ac0ed6fcf5af3be0f63eb935f483f44a402d2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:19 +0100
Subject: netfilter: x_tables: pack percpu counter allocations

Instead of allocating each xt_counter individually, allocate 4k chunks
and then use these for counter allocation requests.

This should speed up rule evaluation by increasing data locality,
also speeds up ruleset loading because we reduce calls to the percpu
allocator.

As Eric points out, we can't use PAGE_SIZE; page_allocator would fail on
arches with 64k page size.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h |  7 ++++++-
 net/ipv4/netfilter/arp_tables.c    |  9 ++++++---
 net/ipv4/netfilter/ip_tables.c     |  9 ++++++---
 net/ipv6/netfilter/ip6_tables.c    |  9 ++++++---
 net/netfilter/x_tables.c           | 33 ++++++++++++++++++++++++---------
 5 files changed, 48 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 05a94bd32c55..5117e4d2ddfa 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -403,8 +403,13 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 	return ret;
 }
 
+struct xt_percpu_counter_alloc_state {
+	unsigned int off;
+	const char __percpu *mem;
+};
 
-bool xt_percpu_counter_alloc(struct xt_counters *counters);
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+			     struct xt_counters *counter);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 808deb275ceb..1258a9ab62ef 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -411,13 +411,14 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 }
 
 static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 
-	if (!xt_percpu_counter_alloc(&e->counters))
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
 		return -ENOMEM;
 
 	t = arpt_get_target(e);
@@ -525,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 static int translate_table(struct xt_table_info *newinfo, void *entry0,
 			   const struct arpt_replace *repl)
 {
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
 	struct arpt_entry *iter;
 	unsigned int *offsets;
 	unsigned int i;
@@ -587,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 	/* Finally, each sanity check must pass */
 	i = 0;
 	xt_entry_foreach(iter, entry0, newinfo->size) {
-		ret = find_check_entry(iter, repl->name, repl->size);
+		ret = find_check_entry(iter, repl->name, repl->size,
+				       &alloc_state);
 		if (ret != 0)
 			break;
 		++i;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a48430d3420f..308b456723f0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -531,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 
 static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
-		 unsigned int size)
+		 unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
@@ -540,7 +541,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
 
-	if (!xt_percpu_counter_alloc(&e->counters))
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
 		return -ENOMEM;
 
 	j = 0;
@@ -676,6 +677,7 @@ static int
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		const struct ipt_replace *repl)
 {
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
 	struct ipt_entry *iter;
 	unsigned int *offsets;
 	unsigned int i;
@@ -735,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 	/* Finally, each sanity check must pass */
 	i = 0;
 	xt_entry_foreach(iter, entry0, newinfo->size) {
-		ret = find_check_entry(iter, net, repl->name, repl->size);
+		ret = find_check_entry(iter, net, repl->name, repl->size,
+				       &alloc_state);
 		if (ret != 0)
 			break;
 		++i;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index a5a92083fd62..d56d8ac09a94 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -562,7 +562,8 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
 
 static int
 find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
-		 unsigned int size)
+		 unsigned int size,
+		 struct xt_percpu_counter_alloc_state *alloc_state)
 {
 	struct xt_entry_target *t;
 	struct xt_target *target;
@@ -571,7 +572,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
 
-	if (!xt_percpu_counter_alloc(&e->counters))
+	if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
 		return -ENOMEM;
 
 	j = 0;
@@ -705,6 +706,7 @@ static int
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		const struct ip6t_replace *repl)
 {
+	struct xt_percpu_counter_alloc_state alloc_state = { 0 };
 	struct ip6t_entry *iter;
 	unsigned int *offsets;
 	unsigned int i;
@@ -764,7 +766,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 	/* Finally, each sanity check must pass */
 	i = 0;
 	xt_entry_foreach(iter, entry0, newinfo->size) {
-		ret = find_check_entry(iter, net, repl->name, repl->size);
+		ret = find_check_entry(iter, net, repl->name, repl->size,
+				       &alloc_state);
 		if (ret != 0)
 			break;
 		++i;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index be5e83047594..f6ce4a7036e6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 
 #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+#define XT_PCPU_BLOCK_SIZE 4096
 
 struct compat_delta {
 	unsigned int offset; /* offset in kernel */
@@ -1618,6 +1619,7 @@ EXPORT_SYMBOL_GPL(xt_proto_fini);
 /**
  * xt_percpu_counter_alloc - allocate x_tables rule counter
  *
+ * @state: pointer to xt_percpu allocation state
  * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
  *
  * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
@@ -1626,21 +1628,34 @@ EXPORT_SYMBOL_GPL(xt_proto_fini);
  * Rule evaluation needs to use xt_get_this_cpu_counter() helper
  * to fetch the real percpu counter.
  *
+ * To speed up allocation and improve data locality, a 4kb block is
+ * allocated.
+ *
+ * xt_percpu_counter_alloc_state contains the base address of the
+ * allocated page and the current sub-offset.
+ *
  * returns false on error.
  */
-bool xt_percpu_counter_alloc(struct xt_counters *counter)
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+			     struct xt_counters *counter)
 {
-	void __percpu *res;
+	BUILD_BUG_ON(XT_PCPU_BLOCK_SIZE < (sizeof(*counter) * 2));
 
 	if (nr_cpu_ids <= 1)
 		return true;
 
-	res = __alloc_percpu(sizeof(struct xt_counters),
-			     sizeof(struct xt_counters));
-	if (!res)
-		return false;
-
-	counter->pcnt = (__force unsigned long)res;
+	if (!state->mem) {
+		state->mem = __alloc_percpu(XT_PCPU_BLOCK_SIZE,
+					    XT_PCPU_BLOCK_SIZE);
+		if (!state->mem)
+			return false;
+	}
+	counter->pcnt = (__force unsigned long)(state->mem + state->off);
+	state->off += sizeof(*counter);
+	if (state->off > (XT_PCPU_BLOCK_SIZE - sizeof(*counter))) {
+		state->mem = NULL;
+		state->off = 0;
+	}
 	return true;
 }
 EXPORT_SYMBOL_GPL(xt_percpu_counter_alloc);
@@ -1649,7 +1664,7 @@ void xt_percpu_counter_free(struct xt_counters *counters)
 {
 	unsigned long pcnt = counters->pcnt;
 
-	if (nr_cpu_ids > 1)
+	if (nr_cpu_ids > 1 && (pcnt & (XT_PCPU_BLOCK_SIZE - 1)) == 0)
 		free_percpu((void __percpu *)pcnt);
 }
 EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
-- 
cgit v1.2.3


From df122f58b834b24c27d7e2ac02a4910d3e56f6ae Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 28 Nov 2016 11:40:05 +0100
Subject: netfilter: ingress: translate 0 nf_hook_slow retval to -1

The caller assumes that < 0 means that skb was stolen (or free'd).

All other return values continue skb processing.

nf_hook_slow returns 3 different return value types:

A) a (negative) errno value: the skb was dropped (NF_DROP, e.g.
by iptables '-j DROP' rule).

B) 0. The skb was stolen by the hook or queued to userspace.

C) 1. all hooks returned NF_ACCEPT so the caller should invoke
   the okfn so packet processing can continue.

nft ingress facility currently doesn't have the 'okfn' that
the NF_HOOK() macros use; there is no nfqueue support either.

So 1 means that nf_hook_ingress() caller should go on processing the skb.

In order to allow use of NF_STOLEN from ingress we need to translate
this to an errno number, else we'd crash because we continue with an
already-freed (or about to be freed) skb.

The errno value isn't checked; it's just important that it's less than 0,
so return -1.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ingress.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 2dc3b49b804a..59476061de86 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -19,6 +19,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 {
 	struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress);
 	struct nf_hook_state state;
+	int ret;
 
 	/* Must recheck the ingress hook head, in the event it became NULL
 	 * after the check in nf_hook_ingress_active evaluated to true.
@@ -29,7 +30,11 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
-	return nf_hook_slow(skb, &state, e);
+	ret = nf_hook_slow(skb, &state, e);
+	if (ret == 0)
+		return -1;
+
+	return ret;
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
-- 
cgit v1.2.3


From 89caaa2d80b7bf9bd8632cd3137254f8c685e5db Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 15:20:07 +0100
Subject: net: stmmac: add support for independent DMA pbl for tx/rx

GMAC and newer cores support independent programmable burst lengths for
DMA tx/rx. Add new optional devicetree properties representing this.

To be backwards compatible, snps,pbl will still be valid, but
snps,txpbl/snps,rxpbl will override the value in snps,pbl if set.

If the IP is synthesized to use the AXI interface, there is a register
and a matching DT property inside the optional stmmac-axi-config DT node
for controlling burst lengths, named snps,blen.
However, using this register, it is not possible to control tx and rx
independently. Also, this register is not available if the IP was
synthesized with, e.g., the AHB interface.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Acked-by: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      |  6 +++++-
 Documentation/networking/stmmac.txt                   | 19 +++++++++++++------
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c   | 12 ++++++------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c      | 12 +++++++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  2 ++
 include/linux/stmmac.h                                |  2 ++
 6 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index b95ff998ba73..8080038ff1b2 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -34,7 +34,11 @@ Optional properties:
   platforms.
 - tx-fifo-depth: See ethernet.txt file in the same directory
 - rx-fifo-depth: See ethernet.txt file in the same directory
-- snps,pbl		Programmable Burst Length
+- snps,pbl		Programmable Burst Length (tx and rx)
+- snps,txpbl		Tx Programmable Burst Length. Only for GMAC and newer.
+			If set, DMA tx will use this value rather than snps,pbl.
+- snps,rxpbl		Rx Programmable Burst Length. Only for GMAC and newer.
+			If set, DMA rx will use this value rather than snps,pbl.
 - snps,aal		Address-Aligned Beats
 - snps,fixed-burst	Program the DMA to use the fixed burst mode
 - snps,mixed-burst	Program the DMA to use the mixed burst mode
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 014f4f756cb7..6add57374f70 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -153,7 +153,8 @@ Where:
    o pbl: the Programmable Burst Length is maximum number of beats to
        be transferred in one DMA transaction.
        GMAC also enables the 4xPBL by default.
-   o fixed_burst/mixed_burst/burst_len
+   o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx.
+   o fixed_burst/mixed_burst/aal
  o clk_csr: fixed CSR Clock range selection.
  o has_gmac: uses the GMAC core.
  o enh_desc: if sets the MAC will use the enhanced descriptor structure.
@@ -205,16 +206,22 @@ tuned according to the HW capabilities.
 
 struct stmmac_dma_cfg {
 	int pbl;
+	int txpbl;
+	int rxpbl;
 	int fixed_burst;
-	int burst_len_supported;
+	int mixed_burst;
+	bool aal;
 };
 
 Where:
- o pbl: Programmable Burst Length
+ o pbl: Programmable Burst Length (tx and rx)
+ o txpbl: Transmit Programmable Burst Length. Only for GMAC and newer.
+	 If set, DMA tx will use this value rather than pbl.
+ o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer.
+	 If set, DMA rx will use this value rather than pbl.
  o fixed_burst: program the DMA to use the fixed burst mode
- o burst_len: this is the value we put in the register
-	      supported values are provided as macros in
-	      linux/stmmac.h header file.
+ o mixed_burst: program the DMA to use the mixed burst mode
+ o aal: Address-Aligned Beats
 
 ---
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 318ae9f10104..99b8040af592 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -89,20 +89,20 @@ static void dwmac1000_dma_init(void __iomem *ioaddr,
 			       u32 dma_tx, u32 dma_rx, int atds)
 {
 	u32 value = readl(ioaddr + DMA_BUS_MODE);
+	int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
+	int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
 	/*
 	 * Set the DMA PBL (Programmable Burst Length) mode.
 	 *
 	 * Note: before stmmac core 3.50 this mode bit was 4xPBL, and
 	 * post 3.5 mode bit acts as 8*PBL.
-	 *
-	 * This configuration doesn't take care about the Separate PBL
-	 * so only the bits: 13-8 are programmed with the PBL passed from the
-	 * platform.
 	 */
 	value |= DMA_BUS_MODE_MAXPBL;
-	value &= ~DMA_BUS_MODE_PBL_MASK;
-	value |= (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT);
+	value |= DMA_BUS_MODE_USP;
+	value &= ~(DMA_BUS_MODE_PBL_MASK | DMA_BUS_MODE_RPBL_MASK);
+	value |= (txpbl << DMA_BUS_MODE_PBL_SHIFT);
+	value |= (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
 
 	/* Set the Fixed burst mode */
 	if (dma_cfg->fixed_burst)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 7d82a3464097..2c3b2098f350 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -71,11 +71,14 @@ static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 }
 
-static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl,
+static void dwmac4_dma_init_channel(void __iomem *ioaddr,
+				    struct stmmac_dma_cfg *dma_cfg,
 				    u32 dma_tx_phy, u32 dma_rx_phy,
 				    u32 channel)
 {
 	u32 value;
+	int txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
+	int rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
 	/* set PBL for each channels. Currently we affect same configuration
 	 * on each channel
@@ -85,11 +88,11 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl,
 	writel(value, ioaddr + DMA_CHAN_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
-	value = value | (pbl << DMA_BUS_MODE_PBL_SHIFT);
+	value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT);
 	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
-	value = value | (pbl << DMA_BUS_MODE_RPBL_SHIFT);
+	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
 	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(channel));
 
 	/* Mask interrupts by writing to CSR7 */
@@ -120,8 +123,7 @@ static void dwmac4_dma_init(void __iomem *ioaddr,
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 
 	for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
-		dwmac4_dma_init_channel(ioaddr, dma_cfg->pbl,
-					dma_tx, dma_rx, i);
+		dwmac4_dma_init_channel(ioaddr, dma_cfg, dma_tx, dma_rx, i);
 }
 
 static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 81800f23a9c4..96afe0561c99 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -315,6 +315,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 	of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
 	if (!dma_cfg->pbl)
 		dma_cfg->pbl = DEFAULT_DMA_PBL;
+	of_property_read_u32(np, "snps,txpbl", &dma_cfg->txpbl);
+	of_property_read_u32(np, "snps,rxpbl", &dma_cfg->rxpbl);
 
 	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
 	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 3537fb33cc90..e6d7a5940819 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -88,6 +88,8 @@ struct stmmac_mdio_bus_data {
 
 struct stmmac_dma_cfg {
 	int pbl;
+	int txpbl;
+	int rxpbl;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
cgit v1.2.3


From 4022d039a315951e59d95d22e79198d861ce4490 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 15:20:08 +0100
Subject: net: stmmac: allow configuring lower pbl values

The driver currently always sets the PBLx8/PBLx4 bit, which means that
the pbl values configured via the pbl/txpbl/rxpbl DT properties are
always multiplied by 8/4 in the hardware.

In order to allow the DT to configure lower pbl values, while at the
same time not changing behavior of any existing device trees using the
pbl/txpbl/rxpbl settings, add a property to disable the multiplication
of the pbl by 8/4 in the hardware.

Suggested-by: Rabin Vincent <rabinv@axis.com>
Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Acked-by: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/stmmac.txt      | 2 ++
 Documentation/networking/stmmac.txt                   | 5 ++++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c   | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c      | 3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c      | 2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 1 +
 include/linux/stmmac.h                                | 1 +
 7 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index 8080038ff1b2..128da752fec9 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -39,6 +39,8 @@ Optional properties:
 			If set, DMA tx will use this value rather than snps,pbl.
 - snps,rxpbl		Rx Programmable Burst Length. Only for GMAC and newer.
 			If set, DMA rx will use this value rather than snps,pbl.
+- snps,no-pbl-x8	Don't multiply the pbl/txpbl/rxpbl values by 8.
+			For core rev < 3.50, don't multiply the values by 4.
 - snps,aal		Address-Aligned Beats
 - snps,fixed-burst	Program the DMA to use the fixed burst mode
 - snps,mixed-burst	Program the DMA to use the mixed burst mode
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 6add57374f70..2bb07078f535 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -152,8 +152,9 @@ Where:
  o dma_cfg: internal DMA parameters
    o pbl: the Programmable Burst Length is maximum number of beats to
        be transferred in one DMA transaction.
-       GMAC also enables the 4xPBL by default.
+       GMAC also enables the 4xPBL by default. (8xPBL for GMAC 3.50 and newer)
    o txpbl/rxpbl: GMAC and newer supports independent DMA pbl for tx/rx.
+   o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
    o fixed_burst/mixed_burst/aal
  o clk_csr: fixed CSR Clock range selection.
  o has_gmac: uses the GMAC core.
@@ -208,6 +209,7 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int txpbl;
 	int rxpbl;
+	bool pblx8;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
@@ -219,6 +221,7 @@ Where:
 	 If set, DMA tx will use this value rather than pbl.
  o rxpbl: Receive Programmable Burst Length. Only for GMAC and newer.
 	 If set, DMA rx will use this value rather than pbl.
+ o pblx8: Enable 8xPBL (4xPBL for core rev < 3.50). Enabled by default.
  o fixed_burst: program the DMA to use the fixed burst mode
  o mixed_burst: program the DMA to use the mixed burst mode
  o aal: Address-Aligned Beats
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 99b8040af592..612d3aaac9a4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -98,7 +98,8 @@ static void dwmac1000_dma_init(void __iomem *ioaddr,
 	 * Note: before stmmac core 3.50 this mode bit was 4xPBL, and
 	 * post 3.5 mode bit acts as 8*PBL.
 	 */
-	value |= DMA_BUS_MODE_MAXPBL;
+	if (dma_cfg->pblx8)
+		value |= DMA_BUS_MODE_MAXPBL;
 	value |= DMA_BUS_MODE_USP;
 	value &= ~(DMA_BUS_MODE_PBL_MASK | DMA_BUS_MODE_RPBL_MASK);
 	value |= (txpbl << DMA_BUS_MODE_PBL_SHIFT);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 2c3b2098f350..8196ab5fc33c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -84,7 +84,8 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr,
 	 * on each channel
 	 */
 	value = readl(ioaddr + DMA_CHAN_CONTROL(channel));
-	value = value | DMA_BUS_MODE_PBL;
+	if (dma_cfg->pblx8)
+		value = value | DMA_BUS_MODE_PBL;
 	writel(value, ioaddr + DMA_CHAN_CONTROL(channel));
 
 	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index 56c8a2342c14..a2831773431a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -81,6 +81,7 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat)
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 32;
+	plat->dma_cfg->pblx8 = true;
 	/* TODO: AXI */
 
 	/* Set default value for multicast hash bins */
@@ -115,6 +116,7 @@ static int quark_default_data(struct plat_stmmacenet_data *plat,
 	plat->mdio_bus_data->phy_mask = 0;
 
 	plat->dma_cfg->pbl = 16;
+	plat->dma_cfg->pblx8 = true;
 	plat->dma_cfg->fixed_burst = 1;
 	/* AXI (TODO) */
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index 96afe0561c99..082cd48db6a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -317,6 +317,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		dma_cfg->pbl = DEFAULT_DMA_PBL;
 	of_property_read_u32(np, "snps,txpbl", &dma_cfg->txpbl);
 	of_property_read_u32(np, "snps,rxpbl", &dma_cfg->rxpbl);
+	dma_cfg->pblx8 = !of_property_read_bool(np, "snps,no-pbl-x8");
 
 	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
 	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e6d7a5940819..266dab9ad782 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -90,6 +90,7 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int txpbl;
 	int rxpbl;
+	bool pblx8;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
cgit v1.2.3


From 13bfff25c081f4e060af761c4082b5a96f756810 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 08:29:10 -0800
Subject: net: rfs: add a jump label

RFS is not commonly used, so add a jump label to avoid some conditionals
in fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h  |  1 +
 include/net/sock.h         | 25 ++++++++++++++-----------
 net/core/dev.c             |  2 ++
 net/core/sysctl_net_core.c |  5 ++++-
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ff5ea6e1221..994f7423a74b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -192,6 +192,7 @@ struct net_device_stats {
 #ifdef CONFIG_RPS
 #include <linux/static_key.h>
 extern struct static_key rps_needed;
+extern struct static_key rfs_needed;
 #endif
 
 struct neighbour;
diff --git a/include/net/sock.h b/include/net/sock.h
index 1749e38d0301..2729e77950b7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	/* Reading sk->sk_rxhash might incur an expensive cache line miss.
-	 *
-	 * TCP_ESTABLISHED does cover almost all states where RFS
-	 * might be useful, and is cheaper [1] than testing :
-	 *	IPv4: inet_sk(sk)->inet_daddr
-	 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-	 * OR	an additional socket flag
-	 * [1] : sk_state and sk_prot are in the same cache line.
-	 */
-	if (sk->sk_state == TCP_ESTABLISHED)
-		sock_rps_record_flow_hash(sk->sk_rxhash);
+	if (static_key_false(&rfs_needed)) {
+		/* Reading sk->sk_rxhash might incur an expensive cache line
+		 * miss.
+		 *
+		 * TCP_ESTABLISHED does cover almost all states where RFS
+		 * might be useful, and is cheaper [1] than testing :
+		 *	IPv4: inet_sk(sk)->inet_daddr
+		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+		 * OR	an additional socket flag
+		 * [1] : sk_state and sk_prot are in the same cache line.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED)
+			sock_rps_record_flow_hash(sk->sk_rxhash);
+	}
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index bffb5253e778..1d33ce03365f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3447,6 +3447,8 @@ EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
 EXPORT_SYMBOL(rps_needed);
+struct static_key rfs_needed __read_mostly;
+EXPORT_SYMBOL(rfs_needed);
 
 static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0df2aa652530..2a46e4009f62 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -79,10 +79,13 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 
 		if (sock_table != orig_sock_table) {
 			rcu_assign_pointer(rps_sock_flow_table, sock_table);
-			if (sock_table)
+			if (sock_table) {
 				static_key_slow_inc(&rps_needed);
+				static_key_slow_inc(&rfs_needed);
+			}
 			if (orig_sock_table) {
 				static_key_slow_dec(&rps_needed);
+				static_key_slow_dec(&rfs_needed);
 				synchronize_rcu();
 				vfree(orig_sock_table);
 			}
-- 
cgit v1.2.3


From c8c8b127091b758f5768f906bcdeeb88bc9951ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 09:19:33 -0800
Subject: udp: under rx pressure, try to condense skbs

Under UDP flood, many softirq producers try to add packets to
UDP receive queue, and one user thread is burning one cpu trying
to dequeue packets as fast as possible.

Two parts of the per packet cost are :
- copying payload from kernel space to user space,
- freeing memory pieces associated with skb.

If socket is under pressure, softirq handler(s) can try to pull in
skb->head the payload of the packet if it fits.

Meaning the softirq handler(s) can free/reuse the page fragment
immediately, instead of letting udp_recvmsg() do this hundreds of usec
later, possibly from another node.

Additional gains :
- We reduce skb->truesize and thus can store more packets per SO_RCVBUF
- We avoid cache line misses at copyout() time and consume_skb() time,
and avoid one put_page() with potential alien freeing on NUMA hosts.

This comes at the cost of a copy, bounded to available tail room, which
is usually small. (We might have to fix GRO_MAX_HEAD which looks bigger
than necessary)

This patch gave me about 5 % increase in throughput in my tests.

skb_condense() helper could probably be used in other contexts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 28 ++++++++++++++++++++++++++++
 net/ipv4/udp.c         | 12 +++++++++++-
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c535fbccf2c..0cd92b0f2af5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1966,6 +1966,8 @@ static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
 	return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
 }
 
+void skb_condense(struct sk_buff *skb);
+
 /**
  *	skb_headroom - bytes at buffer head
  *	@skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b45cd1494243..84151cf40aeb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4931,3 +4931,31 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
 	return clone;
 }
 EXPORT_SYMBOL(pskb_extract);
+
+/**
+ * skb_condense - try to get rid of fragments/frag_list if possible
+ * @skb: buffer
+ *
+ * Can be used to save memory before skb is added to a busy queue.
+ * If packet has bytes in frags and enough tail room in skb->head,
+ * pull all of them, so that we can free the frags right now and adjust
+ * truesize.
+ * Notes:
+ *	We do not reallocate skb->head thus can not fail.
+ *	Caller must re-evaluate skb->truesize if needed.
+ */
+void skb_condense(struct sk_buff *skb)
+{
+	if (!skb->data_len ||
+	    skb->data_len > skb->end - skb->tail ||
+	    skb_cloned(skb))
+		return;
+
+	/* Nice, we can free page frag(s) right now */
+	__pskb_pull_tail(skb, skb->data_len);
+
+	/* Now adjust skb->truesize, since __pskb_pull_tail() does
+	 * not do this.
+	 */
+	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
+}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 16d88ba9ff1c..f5628ada47b5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1199,7 +1199,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct sk_buff_head *list = &sk->sk_receive_queue;
 	int rmem, delta, amt, err = -ENOMEM;
-	int size = skb->truesize;
+	int size;
 
 	/* try to avoid the costly atomic add/sub pair when the receive
 	 * queue is full; always allow at least a packet
@@ -1208,6 +1208,16 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	if (rmem > sk->sk_rcvbuf)
 		goto drop;
 
+	/* Under mem pressure, it might be helpful to help udp_recvmsg()
+	 * having linear skbs :
+	 * - Reduce memory overhead and thus increase receive queue capacity
+	 * - Less cache line misses at copyout() time
+	 * - Less work at consume_skb() (less alien page frag freeing)
+	 */
+	if (rmem > (sk->sk_rcvbuf >> 1))
+		skb_condense(skb);
+	size = skb->truesize;
+
 	/* we drop only if the receive buf is full and the receive
 	 * queue contains some other skb
 	 */
-- 
cgit v1.2.3


From d2a4dd37f6b41fbcad76efbf63124eb3126c66fe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 7 Dec 2016 10:57:59 -0800
Subject: bpf: fix state equivalence

Commits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
and 484611357c19 ("bpf: allow access into map value arrays") by themselves
are correct, but in combination they make state equivalence ignore the 'id'
field of the register state, which can lead to accepting an invalid program.

Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 14 +++++++-------
 kernel/bpf/verifier.c        |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7453c1281531..a13b031dc6b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -18,13 +18,6 @@
 
 struct bpf_reg_state {
 	enum bpf_reg_type type;
-	/*
-	 * Used to determine if any memory access using this register will
-	 * result in a bad access.
-	 */
-	s64 min_value;
-	u64 max_value;
-	u32 id;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
@@ -40,6 +33,13 @@ struct bpf_reg_state {
 		 */
 		struct bpf_map *map_ptr;
 	};
+	u32 id;
+	/* Used to determine if any memory access using this register will
+	 * result in a bad access. These two fields must be last.
+	 * See states_equal()
+	 */
+	s64 min_value;
+	u64 max_value;
 };
 
 enum bpf_stack_slot_type {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index da9fb2a9b7eb..5b14f85f45c6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2528,7 +2528,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 		 * we didn't do a variable access into a map then we are a-ok.
 		 */
 		if (!varlen_map_access &&
-		    rold->type == rcur->type && rold->imm == rcur->imm)
+		    memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
 			continue;
 
 		/* If we didn't map access then again we don't care about the
-- 
cgit v1.2.3


From f38e7a32ee4fc9c8aeeac59e6e0462cd281586e3 Mon Sep 17 00:00:00 2001
From: "Woojung.Huh@microchip.com" <Woojung.Huh@microchip.com>
Date: Wed, 7 Dec 2016 20:26:07 +0000
Subject: phy: add phy fixup unregister functions

From: Woojung Huh <woojung.huh@microchip.com>

Add functions to unregister phy fixup for modules.

int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)
	Unregister phy fixup from phy_fixup_list per bus_id, phy_uid &
	phy_uid_mask

int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask)
	Unregister phy fixup from phy_fixup_list.
	Use it for fixup registered by phy_register_fixup_for_uid()

int phy_unregister_fixup_for_id(const char *bus_id)
	Unregister phy fixup from phy_fixup_list.
	Use it for fixup registered by phy_register_fixup_for_id()

Signed-off-by: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/phy.txt |  9 ++++++++
 drivers/net/phy/phy_device.c     | 47 ++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h              |  4 ++++
 3 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/networking/phy.txt b/Documentation/networking/phy.txt
index e017d933d530..16f90d817224 100644
--- a/Documentation/networking/phy.txt
+++ b/Documentation/networking/phy.txt
@@ -407,6 +407,15 @@ Board Fixups
  The stubs set one of the two matching criteria, and set the other one to
  match anything.
 
+ When phy_register_fixup() or *_for_uid()/*_for_id() is called from a module,
+ the fixup must be unregistered (which also frees its memory) before unload.
+
+ Call one of the following functions before unloading the module.
+
+ int phy_unregister_fixup(const char *phy_id, u32 phy_uid, u32 phy_uid_mask);
+ int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
+ int phy_unregister_fixup_for_id(const char *phy_id);
+
 Standards
 
  IEEE Standard 802.3: CSMA/CD Access Method and Physical Layer Specifications, Section Two:
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index aeaf1bcb12d0..32fa7c76f29c 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -235,6 +235,53 @@ int phy_register_fixup_for_id(const char *bus_id,
 }
 EXPORT_SYMBOL(phy_register_fixup_for_id);
 
+/**
+ * phy_unregister_fixup - remove a phy_fixup from the list
+ * @bus_id: A string that matches fixup->bus_id (or PHY_ANY_ID) in phy_fixup_list
+ * @phy_uid: A phy id that matches fixup->phy_uid (or PHY_ANY_UID) in phy_fixup_list
+ * @phy_uid_mask: Applied to phy_uid and fixup->phy_uid before comparison
+ */
+int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)
+{
+	struct list_head *pos, *n;
+	struct phy_fixup *fixup;
+	int ret;
+
+	ret = -ENODEV;
+
+	mutex_lock(&phy_fixup_lock);
+	list_for_each_safe(pos, n, &phy_fixup_list) {
+		fixup = list_entry(pos, struct phy_fixup, list);
+
+		if ((!strcmp(fixup->bus_id, bus_id)) &&
+		    ((fixup->phy_uid & phy_uid_mask) ==
+		     (phy_uid & phy_uid_mask))) {
+			list_del(&fixup->list);
+			kfree(fixup);
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&phy_fixup_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(phy_unregister_fixup);
+
+/* Unregisters a fixup of any PHY with the UID in phy_uid */
+int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask)
+{
+	return phy_unregister_fixup(PHY_ANY_ID, phy_uid, phy_uid_mask);
+}
+EXPORT_SYMBOL(phy_unregister_fixup_for_uid);
+
+/* Unregisters a fixup of the PHY with id string bus_id */
+int phy_unregister_fixup_for_id(const char *bus_id)
+{
+	return phy_unregister_fixup(bus_id, PHY_ANY_UID, 0xffffffff);
+}
+EXPORT_SYMBOL(phy_unregister_fixup_for_id);
+
 /* Returns 1 if fixup matches phydev in bus_id and phy_uid.
  * Fixups can be set to match any in one or more fields.
  */
diff --git a/include/linux/phy.h b/include/linux/phy.h
index feb8a98e8dd3..f7d95f644eed 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -860,6 +860,10 @@ int phy_register_fixup_for_id(const char *bus_id,
 int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask,
 			       int (*run)(struct phy_device *));
 
+int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask);
+int phy_unregister_fixup_for_id(const char *bus_id);
+int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
+
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
 int phy_get_eee_err(struct phy_device *phydev);
 int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data);
-- 
cgit v1.2.3


From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 7 Dec 2016 15:53:11 -0800
Subject: bpf: xdp: Allow head adjustment in XDP prog

This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

This patch adds one "xdp_adjust_head" bit to bpf_prog for the
XDP-capable driver to check if the XDP prog requires
bpf_xdp_adjust_head() support.  The driver can then decide
to error out during XDP_SETUP_PROG.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/powerpc/net/bpf_jit_comp64.c                  |  4 ++--
 arch/s390/net/bpf_jit_comp.c                       |  2 +-
 arch/x86/net/bpf_jit_comp.c                        |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  5 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 ++++
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  4 ++++
 drivers/net/ethernet/qlogic/qede/qede_main.c       |  5 ++++
 include/linux/filter.h                             |  6 +++--
 include/uapi/linux/bpf.h                           | 11 ++++++++-
 kernel/bpf/core.c                                  |  2 +-
 kernel/bpf/syscall.c                               |  2 ++
 kernel/bpf/verifier.c                              |  2 +-
 net/core/filter.c                                  | 28 ++++++++++++++++++++--
 13 files changed, 67 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 0fe98a567125..73a5cf18fd84 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -766,7 +766,7 @@ emit_clear:
 			func = (u8 *) __bpf_call_base + imm;
 
 			/* Save skb pointer if we need to re-cache skb data */
-			if (bpf_helper_changes_skb_data(func))
+			if (bpf_helper_changes_pkt_data(func))
 				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
 			bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -775,7 +775,7 @@ emit_clear:
 			PPC_MR(b2p[BPF_REG_0], 3);
 
 			/* refresh skb cache */
-			if (bpf_helper_changes_skb_data(func)) {
+			if (bpf_helper_changes_pkt_data(func)) {
 				/* reload skb pointer to r3 */
 				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
 				bpf_jit_emit_skb_loads(image, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bee281f3163d..167b31b186c1 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT2(0x0d00, REG_14, REG_W1);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
-		if (bpf_helper_changes_skb_data((void *)func)) {
+		if (bpf_helper_changes_pkt_data((void *)func)) {
 			jit->seen |= SEEN_SKB_CHANGE;
 			/* lg %b1,ST_OFF_SKBP(%r15) */
 			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe04a04dab8e..e76d1af60f7a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ xadd:			if (is_imm8(insn->off))
 			func = (u8 *) __bpf_call_base + imm32;
 			jmp_offset = func - (image + addrs[i]);
 			if (seen_ld_abs) {
-				reload_skb_data = bpf_helper_changes_skb_data(func);
+				reload_skb_data = bpf_helper_changes_pkt_data(func);
 				if (reload_skb_data) {
 					EMIT1(0x57); /* push %rdi */
 					jmp_offset += 22; /* pop, mov, sub, mov */
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 49a81f1fc1d6..f441eda63bec 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 	int err;
 	int i;
 
+	if (prog && prog->xdp_adjust_head) {
+		en_err(priv, "Does not support bpf_xdp_adjust_head()\n");
+		return -EOPNOTSUPP;
+	}
+
 	xdp_ring_num = prog ? priv->rx_ring_num : 0;
 
 	/* No need to reconfigure buffers when simply swapping the
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 07020276fe73..cbfa38fc72c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog)
 	bool reset, was_opened;
 	int i;
 
+	if (prog && prog->xdp_adjust_head) {
+		netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n");
+		return -EOPNOTSUPP;
+	}
+
 	mutex_lock(&priv->state_lock);
 
 	if ((netdev->features & NETIF_F_LRO) && prog) {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 00d9a03be31d..e8d448109e03 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog)
 	};
 	int err;
 
+	if (prog && prog->xdp_adjust_head) {
+		nn_err(nn, "Does not support bpf_xdp_adjust_head()\n");
+		return -EOPNOTSUPP;
+	}
 	if (!prog && !nn->xdp_prog)
 		return 0;
 	if (prog && nn->xdp_prog) {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index cf1dd1436d93..aecdd1c5c0ea 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog)
 {
 	struct qede_reload_args args;
 
+	if (prog && prog->xdp_adjust_head) {
+		DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n");
+		return -EOPNOTSUPP;
+	}
+
 	/* If we're called, there was already a bpf reference increment */
 	args.func = &qede_xdp_reload_func;
 	args.u.new_prog = prog;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f078d2b1cff6..6a1658308612 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -406,7 +406,8 @@ struct bpf_prog {
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				xdp_adjust_head:1; /* Adjusting pkt head? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
@@ -440,6 +441,7 @@ struct bpf_skb_data_end {
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_hard_start;
 };
 
 /* compute the linear packet data range [data, data_end) which
@@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
-bool bpf_helper_changes_skb_data(void *func);
+bool bpf_helper_changes_pkt_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6123d9b8e828..0eb0e87dbe9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -424,6 +424,12 @@ union bpf_attr {
  *     @len: length of header to be pushed in front
  *     @flags: Flags (unused for now)
  *     Return: 0 on success or negative error
+ *
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ *     Adjust the xdp_md.data by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: A positive/negative integer to be added to xdp_md.data
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -469,7 +475,8 @@ union bpf_attr {
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
 	FN(get_numa_node_id),		\
-	FN(skb_change_head),
+	FN(skb_change_head),		\
+	FN(xdp_adjust_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -576,6 +583,8 @@ struct bpf_sock {
 	__u32 protocol;
 };
 
+#define XDP_PACKET_HEADROOM 256
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bdcc9f4ba767..83e0d153b0b4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 	return prog;
 }
 
-bool __weak bpf_helper_changes_skb_data(void *func)
+bool __weak bpf_helper_changes_pkt_data(void *func)
 {
 	return false;
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 88f609f1c0c3..4819ec9d95f6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 				prog->dst_needed = 1;
 			if (insn->imm == BPF_FUNC_get_prandom_u32)
 				bpf_user_rnd_init_once();
+			if (insn->imm == BPF_FUNC_xdp_adjust_head)
+				prog->xdp_adjust_head = 1;
 			if (insn->imm == BPF_FUNC_tail_call) {
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5b14f85f45c6..d28f9a3380a9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
-	changes_data = bpf_helper_changes_skb_data(fn->func);
+	changes_data = bpf_helper_changes_pkt_data(fn->func);
 
 	memset(&meta, 0, sizeof(meta));
 	meta.pkt_access = fn->pkt_access;
diff --git a/net/core/filter.c b/net/core/filter.c
index b751202e12f8..b1461708a977 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-bool bpf_helper_changes_skb_data(void *func)
+BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
+{
+	void *data = xdp->data + offset;
+
+	if (unlikely(data < xdp->data_hard_start ||
+		     data > xdp->data_end - ETH_HLEN))
+		return -EINVAL;
+
+	xdp->data = data;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
+	.func		= bpf_xdp_adjust_head,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
+bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
 	    func == bpf_skb_vlan_pop ||
@@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func)
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace)
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_xdp_adjust_head)
 		return true;
 
 	return false;
@@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_xdp_event_output_proto;
 	case BPF_FUNC_get_smp_processor_id:
 		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_xdp_adjust_head:
+		return &bpf_xdp_adjust_head_proto;
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From c84d949057cab262b4d3110ead9a42a58c2958f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Dec 2016 11:41:55 -0800
Subject: udp: copy skb->truesize in the first cache line

In UDP RX handler, we currently clear skb->dev before skb
is added to receive queue, because device pointer is no longer
available once we exit from RCU section.

Since this first cache line is always hot, let's reuse this space
to store skb->truesize and thus avoid a cache line miss at
udp_recvmsg()/udp_skb_destructor time while receive queue
spinlock is held.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  9 ++++++++-
 net/ipv4/udp.c         | 13 ++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0cd92b0f2af5..332e76756f54 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -645,8 +645,15 @@ struct sk_buff {
 		struct rb_node	rbnode; /* used in netem & tcp stack */
 	};
 	struct sock		*sk;
-	struct net_device	*dev;
 
+	union {
+		struct net_device	*dev;
+		/* Some protocols might use this space to store information,
+		 * while device pointer would be NULL.
+		 * UDP receive path is one user.
+		 */
+		unsigned long		dev_scratch;
+	};
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e6a68d66f3b2..c608334d99aa 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1188,10 +1188,14 @@ static void udp_rmem_release(struct sock *sk, int size, int partial)
 		__sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
 }
 
-/* Note: called with sk_receive_queue.lock held */
+/* Note: called with sk_receive_queue.lock held.
+ * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
+ * This avoids a cache line miss while receive_queue lock is held.
+ * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
+ */
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
 {
-	udp_rmem_release(sk, skb->truesize, 1);
+	udp_rmem_release(sk, skb->dev_scratch, 1);
 }
 EXPORT_SYMBOL(udp_skb_destructor);
 
@@ -1246,6 +1250,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 		busy = busylock_acquire(sk);
 	}
 	size = skb->truesize;
+	/* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
+	 * in udp_skb_destructor()
+	 */
+	skb->dev_scratch = size;
 
 	/* we drop only if the receive buf is full and the receive
 	 * queue contains some other skb
@@ -1272,7 +1280,6 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 	/* no need to setup a destructor, we will explicitly release the
 	 * forward allocated memory on dequeue
 	 */
-	skb->dev = NULL;
 	sock_skb_set_dropcount(sk, skb);
 
 	__skb_queue_tail(list, skb);
-- 
cgit v1.2.3


From 6b229cf77d683f634f0edd876c6d1015402303ad Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Dec 2016 11:41:56 -0800
Subject: udp: add batching to udp_rmem_release()

If udp_recvmsg() constantly releases sk_rmem_alloc
for every read packet, it gives opportunity for
producers to immediately grab spinlocks and desperately
try adding another packet, causing false sharing.

We can add a simple heuristic to give the signal
by batches of ~25 % of the queue capacity.

This patch considerably increases performance under
flood by about 50 %, since the thread draining the queue
is no longer slowed by false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h |  3 +++
 net/ipv4/udp.c      | 12 ++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd39478..c0f530809d1f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,9 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+
+	/* This field is dirtied by udp_recvmsg() */
+	int		forward_deficit;
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c608334d99aa..5a38faa12cde 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,8 +1177,20 @@ out:
 /* fully reclaim rmem/fwd memory allocated for skb */
 static void udp_rmem_release(struct sock *sk, int size, int partial)
 {
+	struct udp_sock *up = udp_sk(sk);
 	int amt;
 
+	if (likely(partial)) {
+		up->forward_deficit += size;
+		size = up->forward_deficit;
+		if (size < (sk->sk_rcvbuf >> 2) &&
+		    !skb_queue_empty(&sk->sk_receive_queue))
+			return;
+	} else {
+		size += up->forward_deficit;
+	}
+	up->forward_deficit = 0;
+
 	atomic_sub(size, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc += size;
 	amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
-- 
cgit v1.2.3