From 92ee76d4a926843d0e135aa0c2d9f57504c6876c Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 6 Dec 2019 15:31:03 +0100 Subject: trivial: mac80211: fix indentation Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20191206143103.3645-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7d3f2ced92d1..e172d0c7bf74 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2047,7 +2047,7 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) #define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 #define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 #define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 +#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 -- cgit v1.2.3 From 30b2f0be23fb40e58d0ad2caf8702c2a44cda2e1 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 13 Jan 2020 21:59:40 -0800 Subject: mac80211: add ieee80211_is_any_nullfunc() commit 08a5bdde3812 ("mac80211: consider QoS Null frames for STA_NULLFUNC_ACKED") fixed a bug where we failed to take into account that a nullfunc frame can be either non-QoS or QoS. It turns out there is at least one more bug in ieee80211_sta_tx_notify(), introduced in commit 7b6ddeaf27ec ("mac80211: use QoS NDP for AP probing"), where we forgot to check for the QoS variant and so assumed the QoS nullfunc frame never went out. Fix this by adding a helper ieee80211_is_any_nullfunc() which consolidates the check for non-QoS and QoS nullfunc frames. Replace existing compound conditionals and add a couple more missing checks for the QoS variant.
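For illustration, a minimal standalone sketch of the consolidated check (not kernel code: the frame-control constants are copied from include/linux/ieee80211.h, and the __le16 byte-order handling of the real helpers is elided, so plain host-order values are used):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define IEEE80211_FCTL_FTYPE		0x000c
#define IEEE80211_FCTL_STYPE		0x00f0
#define IEEE80211_FTYPE_DATA		0x0008
#define IEEE80211_STYPE_NULLFUNC	0x0040
#define IEEE80211_STYPE_QOS_NULLFUNC	0x00c0

/* Mirrors ieee80211_is_nullfunc(): matches only the non-QoS variant. */
static bool is_nullfunc(uint16_t fc)
{
	return (fc & (IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) ==
	       (IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC);
}

/* Mirrors ieee80211_is_qos_nullfunc(): matches only the QoS variant. */
static bool is_qos_nullfunc(uint16_t fc)
{
	return (fc & (IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) ==
	       (IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC);
}

/* The consolidated check introduced here as ieee80211_is_any_nullfunc(). */
static bool is_any_nullfunc(uint16_t fc)
{
	return is_nullfunc(fc) || is_qos_nullfunc(fc);
}

int main(void)
{
	uint16_t nullfunc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC;
	uint16_t qos_nullfunc = IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC;

	/* Checking only the non-QoS variant misses the QoS frame ... */
	assert(is_nullfunc(nullfunc) && !is_nullfunc(qos_nullfunc));
	/* ... while the consolidated helper accepts both. */
	assert(is_any_nullfunc(nullfunc) && is_any_nullfunc(qos_nullfunc));
	return 0;
}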
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200114055940.18502-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 9 +++++++++ net/mac80211/mlme.c | 2 +- net/mac80211/rx.c | 8 +++----- net/mac80211/status.c | 5 ++--- net/mac80211/tx.c | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e172d0c7bf74..1c4409b4c012 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -619,6 +619,15 @@ static inline bool ieee80211_is_qos_nullfunc(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC); } +/** + * ieee80211_is_any_nullfunc - check if frame is regular or QoS nullfunc frame + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_any_nullfunc(__le16 fc) +{ + return (ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)); +} + /** * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU * @fc: frame control field in little-endian byteorder diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6e4099009eab..cb6fd0a09e07 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2460,7 +2460,7 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, if (!ieee80211_is_data(hdr->frame_control)) return; - if (ieee80211_is_nullfunc(hdr->frame_control) && + if (ieee80211_is_any_nullfunc(hdr->frame_control) && sdata->u.mgd.probe_send_count > 0) { if (ack) ieee80211_sta_reset_conn_monitor(sdata); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0e05ff037672..619c223f1cde 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1450,8 +1450,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (ieee80211_is_ctl(hdr->frame_control) || - ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control) || + ieee80211_is_any_nullfunc(hdr->frame_control) || is_multicast_ether_addr(hdr->addr1)) return RX_CONTINUE; @@ -1838,8 +1837,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * Drop (qos-)data::nullfunc frames silently, since they * are used only to control station power saving mode. */ - if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) { + if (ieee80211_is_any_nullfunc(hdr->frame_control)) { I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc); /* @@ -2319,7 +2317,7 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) /* Drop unencrypted frames if key is set. 
*/ if (unlikely(!ieee80211_has_protected(fc) && - !ieee80211_is_nullfunc(fc) && + !ieee80211_is_any_nullfunc(fc) && ieee80211_is_data(fc) && rx->key)) return -EACCES; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index c9b90d38c54d..22512805eafb 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -643,8 +643,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { - if (ieee80211_is_nullfunc(hdr->frame_control) || - ieee80211_is_qos_nullfunc(hdr->frame_control)) + if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, @@ -1061,7 +1060,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, I802_DEBUG_INC(local->dot11FailedCount); } - if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) && + if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index c56d801e708f..4296d9d71311 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -297,7 +297,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) && !ieee80211_is_probe_req(hdr->frame_control) && - !ieee80211_is_nullfunc(hdr->frame_control)) + !ieee80211_is_any_nullfunc(hdr->frame_control)) /* * When software scanning only nullfunc frames (to notify * the sleep state to the AP) and probe requests (for the -- cgit v1.2.3 From f93d6b21a93ceb02140eafd84e4fd77f5d00180a Mon Sep 17 00:00:00 2001 From: Zvika Yehudai Date: Mon, 3 Feb 2020 10:08:23 +0200 Subject: ieee80211: fix 'the' doubling in comments Remove redundant 'the' where 'the the' was written. Signed-off-by: Zvika Yehudai Link: https://lore.kernel.org/r/20200203080823.24949-1-zvikayeh@gmail.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1c4409b4c012..095a7108c394 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2062,7 +2062,7 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) /* * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size - * @he_oper_ie: byte data of the He Operations IE, stating from the the byte + * @he_oper_ie: byte data of the He Operations IE, stating from the byte * after the ext ID byte. It is assumed that he_oper_ie has at least * sizeof(struct ieee80211_he_operation) bytes, the caller must have * validated this. @@ -2100,7 +2100,7 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) /* * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size - * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the the byte + * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte * after the ext ID byte.
It is assumed that he_spr_ie has at least * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated * this @@ -2743,7 +2743,7 @@ enum ieee80211_tdls_actioncode { */ #define WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT BIT(6) -/* TDLS capabilities in the the 4th byte of @WLAN_EID_EXT_CAPABILITY */ +/* TDLS capabilities in the 4th byte of @WLAN_EID_EXT_CAPABILITY */ #define WLAN_EXT_CAPA4_TDLS_BUFFER_STA BIT(4) #define WLAN_EXT_CAPA4_TDLS_PEER_PSM BIT(5) #define WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH BIT(6) -- cgit v1.2.3 From c0058df73309906ef4d5383fbaa10c43ebddc48a Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Fri, 31 Jan 2020 13:12:57 +0200 Subject: mac80211: parse also the RSNXE IE Parse also the RSN Extension IE when parsing the rest of the IEs. It will be used in a later patch. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200131111300.891737-21-luca@coelho.fi Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++++++ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/util.c | 7 ++++++- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 095a7108c394..6f3e7c5c600a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2532,6 +2532,7 @@ enum ieee80211_eid { WLAN_EID_FILS_INDICATION = 240, WLAN_EID_DILS = 241, WLAN_EID_FRAGMENT = 242, + WLAN_EID_RSNX = 244, WLAN_EID_EXTENSION = 255 }; @@ -3421,4 +3422,11 @@ static inline bool for_each_element_completed(const struct element *element, return (const u8 *)element == (const u8 *)data + datalen; } +/** + * RSNX Capabilities: + * bits 0-3: Field length (n-1) + */ +#define WLAN_RSNX_CAPA_PROTECTED_TWT BIT(4) +#define WLAN_RSNX_CAPA_SAE_H2E BIT(5) + #endif /* LINUX_IEEE80211_H */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7074af92b536..8a49d78ad7c9 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1472,6 +1472,7 @@ struct ieee802_11_elems { const struct ieee80211_tim_ie *tim; const u8 *challenge; const u8 *rsn; + const u8 *rsnx; const u8 *erp_info; const u8 *ext_supp_rates; const u8 *wmm_info; @@ -1519,6 +1520,7 @@ struct ieee802_11_elems { u8 tim_len; u8 challenge_len; u8 rsn_len; + u8 rsnx_len; u8 ext_supp_rates_len; u8 wmm_info_len; u8 wmm_param_len; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 72039c8dbc38..7ddf0508779f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation * * utilities for mac80211 */ @@ -994,6 +994,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, case WLAN_EID_CHAN_SWITCH_TIMING: case WLAN_EID_LINK_ID: case WLAN_EID_BSS_MAX_IDLE_PERIOD: + case WLAN_EID_RSNX: /* * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible * that if the content gets bigger it might be needed more than once @@ -1264,6 +1265,10 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, if (elen >= sizeof(*elems->max_idle_period_ie)) elems->max_idle_period_ie = (void *)pos; break; + case WLAN_EID_RSNX: + elems->rsnx = pos; + elems->rsnx_len = elen; + break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? 
&crc : NULL, -- cgit v1.2.3 From 256db7423c31c873abe6fb780513dd7b5705a510 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 13 Feb 2020 13:16:17 +0000 Subject: ieee80211: add WPA3 OWE AKM suite selector Add the definition for Opportunistic Wireless Encryption AKM selector. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200213131608.10541-3-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6f3e7c5c600a..33d907eec0b6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3044,6 +3044,7 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) #define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) #define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) +#define WLAN_AKM_SUITE_OWE SUITE(0x000FAC, 18) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From d71151a39c97d551378a441c089508b0bca48210 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 11 Feb 2020 12:52:49 +0800 Subject: ptp_qoriq: drop the code of alarm The alarm function hadn't been supported by PTP clock driver. The recommended solution PHC + phc2sys + nanosleep provides best performance. So drop the code of alarm in ptp_qoriq driver. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/ptp_qoriq.c | 29 +---------------------------- include/linux/fsl/ptp_qoriq.h | 2 -- 2 files changed, 1 insertion(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index b27c46ebfc8f..c09c16be0edf 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -131,8 +131,7 @@ irqreturn_t ptp_qoriq_isr(int irq, void *priv) struct ptp_qoriq *ptp_qoriq = priv; struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; struct ptp_clock_event event; - u64 ns; - u32 ack = 0, lo, hi, mask, val, irqs; + u32 ack = 0, mask, val, irqs; spin_lock(&ptp_qoriq->lock); @@ -153,32 +152,6 @@ irqreturn_t ptp_qoriq_isr(int irq, void *priv) extts_clean_up(ptp_qoriq, 1, true); } - if (irqs & ALM2) { - ack |= ALM2; - if (ptp_qoriq->alarm_value) { - event.type = PTP_CLOCK_ALARM; - event.index = 0; - event.timestamp = ptp_qoriq->alarm_value; - ptp_clock_event(ptp_qoriq->clock, &event); - } - if (ptp_qoriq->alarm_interval) { - ns = ptp_qoriq->alarm_value + ptp_qoriq->alarm_interval; - hi = ns >> 32; - lo = ns & 0xffffffff; - ptp_qoriq->write(®s->alarm_regs->tmr_alarm2_l, lo); - ptp_qoriq->write(®s->alarm_regs->tmr_alarm2_h, hi); - ptp_qoriq->alarm_value = ns; - } else { - spin_lock(&ptp_qoriq->lock); - mask = ptp_qoriq->read(®s->ctrl_regs->tmr_temask); - mask &= ~ALM2EN; - ptp_qoriq->write(®s->ctrl_regs->tmr_temask, mask); - spin_unlock(&ptp_qoriq->lock); - ptp_qoriq->alarm_value = 0; - ptp_qoriq->alarm_interval = 0; - } - } - if (irqs & PP1) { ack |= PP1; event.type = PTP_CLOCK_PPS; diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index b0b743563f43..75884563059f 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -149,8 +149,6 @@ struct ptp_qoriq { bool extts_fifo_support; int irq; int phc_index; - u64 alarm_interval; /* for periodic alarm */ - u64 alarm_value; u32 tclk_period; /* nanoseconds */ u32 tmr_prsc; u32 tmr_add; -- cgit v1.2.3 From 8062e2333f8f7dcd5627e22b99e18d1cbb53eedb Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 15 Feb 2020 23:57:36 
+0000 Subject: net: linkmode: make linkmode_test_bit() take const pointer linkmode_test_bit() does not modify the address; test_bit() is also declared const volatile for the same reason. There's no need for linkmode_test_bit() to be any different, and making it const allows the implementation of helpers that take a const linkmode pointer. Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/linkmode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index fe740031339d..8e5b352e44f2 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -71,7 +71,7 @@ static inline void linkmode_change_bit(int nr, volatile unsigned long *addr) __change_bit(nr, addr); } -static inline int linkmode_test_bit(int nr, volatile unsigned long *addr) +static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr) { return test_bit(nr, addr); } -- cgit v1.2.3 From a87ae8a963bde755b0962bcc18db83d611f63e7a Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 15 Feb 2020 15:49:27 +0000 Subject: net: add helpers to resolve negotiated flow control Add a couple of helpers to resolve negotiated flow control. Two helpers are provided: - linkmode_resolve_pause() which takes the link partner and local advertisements, and decodes whether we should enable TX or RX pause at the MAC. This is useful outside of phylib, e.g. in phylink. - phy_get_pause(), which returns the TX/RX enablement status for the current negotiation results of the PHY. This allows us to centralise the flow control resolution, rather than spreading it around. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/Makefile | 3 ++- drivers/net/phy/linkmode.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy_device.c | 26 ++++++++++++++++++++++++++ include/linux/linkmode.h | 4 ++++ include/linux/phy.h | 3 +++ 5 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 drivers/net/phy/linkmode.c (limited to 'include/linux') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index fe5badf13b65..d523fd5670e4 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Linux PHY drivers and MDIO bus drivers -libphy-y := phy.o phy-c45.o phy-core.o phy_device.o +libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ + linkmode.o mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_MDIO_DEVICE diff --git a/drivers/net/phy/linkmode.c b/drivers/net/phy/linkmode.c new file mode 100644 index 000000000000..969918795228 --- /dev/null +++ b/drivers/net/phy/linkmode.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0+ +#include <linux/linkmode.h> + +/** + * linkmode_resolve_pause - resolve the allowable pause modes + * @local_adv: local advertisement in ethtool format + * @partner_adv: partner advertisement in ethtool format + * @tx_pause: pointer to bool to indicate whether transmit pause should be + * enabled. + * @rx_pause: pointer to bool to indicate whether receive pause should be + * enabled.
+ * + * Flow control is resolved according to our and the link partners + * advertisements using the following drawn from the 802.3 specs: + * Local device Link partner + * Pause AsymDir Pause AsymDir Result + * 0 X 0 X Disabled + * 0 1 1 0 Disabled + * 0 1 1 1 TX + * 1 0 0 X Disabled + * 1 X 1 X TX+RX + * 1 1 0 1 RX + */ +void linkmode_resolve_pause(const unsigned long *local_adv, + const unsigned long *partner_adv, + bool *tx_pause, bool *rx_pause) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(m); + + linkmode_and(m, local_adv, partner_adv); + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, m)) { + *tx_pause = true; + *rx_pause = true; + } else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, m)) { + *tx_pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + partner_adv); + *rx_pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + local_adv); + } else { + *tx_pause = false; + *rx_pause = false; + } +} +EXPORT_SYMBOL_GPL(linkmode_resolve_pause); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 6a5056e0ae77..f5a7a077ec1f 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2409,6 +2409,32 @@ bool phy_validate_pause(struct phy_device *phydev, } EXPORT_SYMBOL(phy_validate_pause); +/** + * phy_get_pause - resolve negotiated pause modes + * @phydev: phy_device struct + * @tx_pause: pointer to bool to indicate whether transmit pause should be + * enabled. + * @rx_pause: pointer to bool to indicate whether receive pause should be + * enabled. + * + * Resolve and return the flow control modes according to the negotiation + * result. This includes checking that we are operating in full duplex mode. + * See linkmode_resolve_pause() for further details. + */ +void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause) +{ + if (phydev->duplex != DUPLEX_FULL) { + *tx_pause = false; + *rx_pause = false; + return; + } + + return linkmode_resolve_pause(phydev->advertising, + phydev->lp_advertising, + tx_pause, rx_pause); +} +EXPORT_SYMBOL(phy_get_pause); + static bool phy_drv_supports_irq(struct phy_driver *phydrv) { return phydrv->config_intr && phydrv->ack_interrupt; diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 8e5b352e44f2..9ec210f31d06 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -88,4 +88,8 @@ static inline int linkmode_subset(const unsigned long *src1, return bitmap_subset(src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); } +void linkmode_resolve_pause(const unsigned long *local_adv, + const unsigned long *partner_adv, + bool *tx_pause, bool *rx_pause); + #endif /* __LINKMODE_H */ diff --git a/include/linux/phy.h b/include/linux/phy.h index c570e162e05e..80f8b2158271 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1257,6 +1257,9 @@ void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp); +void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); +void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, + bool *tx_pause, bool *rx_pause); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); -- cgit v1.2.3 From 45c767faef151899ac1a5e14a59c0e0a5bdba27b Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 15 Feb 2020 15:49:32 +0000 Subject: net: add linkmode helper for setting flow control advertisement 
Add a linkmode helper to set the flow control advertisement in an ethtool linkmode mask according to the tx/rx capabilities. This implementation is moved from phylib, and documented with an analysis of its shortcomings. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/linkmode.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy_device.c | 17 +-------------- include/linux/linkmode.h | 2 ++ 3 files changed, 54 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/linkmode.c b/drivers/net/phy/linkmode.c index 969918795228..f60560fe3499 100644 --- a/drivers/net/phy/linkmode.c +++ b/drivers/net/phy/linkmode.c @@ -42,3 +42,54 @@ void linkmode_resolve_pause(const unsigned long *local_adv, } } EXPORT_SYMBOL_GPL(linkmode_resolve_pause); + +/** + * linkmode_set_pause - set the pause mode advertisement + * @advertisement: advertisement in ethtool format + * @tx: boolean from ethtool struct ethtool_pauseparam tx_pause member + * @rx: boolean from ethtool struct ethtool_pauseparam rx_pause member + * + * Configure the advertised Pause and Asym_Pause bits according to the + * capabilities of provided in @tx and @rx. + * + * We convert as follows: + * tx rx Pause AsymDir + * 0 0 0 0 + * 0 1 1 1 + * 1 0 0 1 + * 1 1 1 0 + * + * Note: this translation from ethtool tx/rx notation to the advertisement + * is actually very problematical. Here are some examples: + * + * For tx=0 rx=1, meaning transmit is unsupported, receive is supported: + * + * Local device Link partner + * Pause AsymDir Pause AsymDir Result + * 1 1 1 0 TX + RX - but we have no TX support. + * 1 1 0 1 Only this gives RX only + * + * For tx=1 rx=1, meaning we have the capability to transmit and receive + * pause frames: + * + * Local device Link partner + * Pause AsymDir Pause AsymDir Result + * 1 0 0 1 Disabled - but since we do support tx and rx, + * this should resolve to RX only. + * + * Hence, asking for: + * rx=1 tx=0 gives Pause+AsymDir advertisement, but we may end up + * resolving to tx+rx pause or only rx pause depending on + * the partners advertisement. + * rx=0 tx=1 gives AsymDir only, which will only give tx pause if + * the partners advertisement allows it. + * rx=1 tx=1 gives Pause only, which will only allow tx+rx pause + * if the other end also advertises Pause. 
+ */ +void linkmode_set_pause(unsigned long *advertisement, bool tx, bool rx) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertisement, rx); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertisement, + rx ^ tx); +} +EXPORT_SYMBOL_GPL(linkmode_set_pause); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index f5a7a077ec1f..2a973265de80 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2361,22 +2361,7 @@ void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx) __ETHTOOL_DECLARE_LINK_MODE_MASK(oldadv); linkmode_copy(oldadv, phydev->advertising); - - linkmode_clear_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->advertising); - linkmode_clear_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->advertising); - - if (rx) { - linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->advertising); - linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->advertising); - } - - if (tx) - linkmode_change_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->advertising); + linkmode_set_pause(phydev->advertising, tx, rx); if (!linkmode_equal(oldadv, phydev->advertising) && phydev->autoneg) diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 9ec210f31d06..c664c27a29a0 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -92,4 +92,6 @@ void linkmode_resolve_pause(const unsigned long *local_adv, const unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); +void linkmode_set_pause(unsigned long *advertisement, bool tx, bool rx); + #endif /* __LINKMODE_H */ -- cgit v1.2.3 From 4e5aeb4157c879a021e6d92373dc7e4684ebd8c0 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 15 Feb 2020 15:49:53 +0000 Subject: net: phylink: resolve fixed link flow control Resolve the fixed link flow control using the recently introduced linkmode_resolve_pause() helper, which we use in phylink_get_fixed_state() only when operating in full duplex mode. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 70 +++++++++++++++++++++-------------------------- include/linux/phylink.h | 8 ++---- 2 files changed, 34 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index e65e9c9dc759..c29648b90ce7 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -181,9 +181,11 @@ static int phylink_parse_fixedlink(struct phylink *pl, /* We treat the "pause" and "asym-pause" terminology as * defining the link partner's ability. 
*/ if (fwnode_property_read_bool(fixed_node, "pause")) - pl->link_config.pause |= MLO_PAUSE_SYM; + __set_bit(ETHTOOL_LINK_MODE_Pause_BIT, + pl->link_config.lp_advertising); if (fwnode_property_read_bool(fixed_node, "asym-pause")) - pl->link_config.pause |= MLO_PAUSE_ASYM; + __set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + pl->link_config.lp_advertising); if (ret == 0) { desc = fwnode_gpiod_get_index(fixed_node, "link", 0, @@ -215,9 +217,11 @@ static int phylink_parse_fixedlink(struct phylink *pl, DUPLEX_FULL : DUPLEX_HALF; pl->link_config.speed = prop[2]; if (prop[3]) - pl->link_config.pause |= MLO_PAUSE_SYM; + __set_bit(ETHTOOL_LINK_MODE_Pause_BIT, + pl->link_config.lp_advertising); if (prop[4]) - pl->link_config.pause |= MLO_PAUSE_ASYM; + __set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + pl->link_config.lp_advertising); } } @@ -351,6 +355,22 @@ static void phylink_apply_manual_flow(struct phylink *pl, state->pause = pl->link_config.pause; } +static void phylink_resolve_flow(struct phylink_link_state *state) +{ + bool tx_pause, rx_pause; + + state->pause = MLO_PAUSE_NONE; + if (state->duplex == DUPLEX_FULL) { + linkmode_resolve_pause(state->advertising, + state->lp_advertising, + &tx_pause, &rx_pause); + if (tx_pause) + state->pause |= MLO_PAUSE_TX; + if (rx_pause) + state->pause |= MLO_PAUSE_RX; + } +} + static void phylink_mac_config(struct phylink *pl, const struct phylink_link_state *state) { @@ -399,44 +419,16 @@ static void phylink_mac_pcs_get_state(struct phylink *pl, /* The fixed state is... fixed except for the link state, * which may be determined by a GPIO or a callback. */ -static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) +static void phylink_get_fixed_state(struct phylink *pl, + struct phylink_link_state *state) { *state = pl->link_config; if (pl->get_fixed_state) pl->get_fixed_state(pl->netdev, state); else if (pl->link_gpio) state->link = !!gpiod_get_value_cansleep(pl->link_gpio); -} -/* Flow control is resolved according to our and the link partners - * advertisements using the following drawn from the 802.3 specs: - * Local device Link partner - * Pause AsymDir Pause AsymDir Result - * 1 X 1 X TX+RX - * 0 1 1 1 TX - * 1 1 0 1 RX - */ -static void phylink_resolve_flow(struct phylink *pl, - struct phylink_link_state *state) -{ - int new_pause = 0; - int pause = 0; - - if (phylink_test(pl->link_config.advertising, Pause)) - pause |= MLO_PAUSE_SYM; - if (phylink_test(pl->link_config.advertising, Asym_Pause)) - pause |= MLO_PAUSE_ASYM; - - pause &= state->pause; - - if (pause & MLO_PAUSE_SYM) - new_pause = MLO_PAUSE_TX | MLO_PAUSE_RX; - else if (pause & MLO_PAUSE_ASYM) - new_pause = state->pause & MLO_PAUSE_SYM ? 
- MLO_PAUSE_TX : MLO_PAUSE_RX; - - state->pause &= ~MLO_PAUSE_TXRX_MASK; - state->pause |= new_pause; + phylink_resolve_flow(state); } static const char *phylink_pause_to_str(int pause) @@ -1393,8 +1385,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, !pause->autoneg && pause->rx_pause != pause->tx_pause) return -EINVAL; - config->pause &= ~(MLO_PAUSE_AN | MLO_PAUSE_TXRX_MASK); - + config->pause = 0; if (pause->autoneg) config->pause |= MLO_PAUSE_AN; if (pause->rx_pause) @@ -1505,13 +1496,14 @@ static int phylink_mii_emul_read(unsigned int reg, struct phylink_link_state *state) { struct fixed_phy_status fs; + unsigned long *lpa = state->lp_advertising; int val; fs.link = state->link; fs.speed = state->speed; fs.duplex = state->duplex; - fs.pause = state->pause & MLO_PAUSE_SYM; - fs.asym_pause = state->pause & MLO_PAUSE_ASYM; + fs.pause = test_bit(ETHTOOL_LINK_MODE_Pause_BIT, lpa); + fs.asym_pause = test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, lpa); val = swphy_read_reg(reg, &fs); if (reg == MII_BMSR) { diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 523209e70947..0d6073c2b2b7 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -12,12 +12,10 @@ struct net_device; enum { MLO_PAUSE_NONE, - MLO_PAUSE_ASYM = BIT(0), - MLO_PAUSE_SYM = BIT(1), - MLO_PAUSE_RX = BIT(2), - MLO_PAUSE_TX = BIT(3), + MLO_PAUSE_RX = BIT(0), + MLO_PAUSE_TX = BIT(1), MLO_PAUSE_TXRX_MASK = MLO_PAUSE_TX | MLO_PAUSE_RX, - MLO_PAUSE_AN = BIT(4), + MLO_PAUSE_AN = BIT(2), MLO_AN_PHY = 0, /* Conventional PHY */ MLO_AN_FIXED, /* Fixed-link mode */ -- cgit v1.2.3 From b70486f94bb4820e84491089da5e30d29e774b0d Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 15 Feb 2020 15:50:09 +0000 Subject: net: phylink: clarify flow control settings in documentation Clarify the expected flow control settings operation in the phylink documentation for each negotiation mode. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phylink.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 0d6073c2b2b7..812357c03df4 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -152,13 +152,20 @@ void mac_pcs_get_state(struct phylink_config *config, * guaranteed to be correct, and so any mac_config() implementation must * never reference these fields. * + * In all negotiation modes, as defined by @mode, @state->pause indicates the + * pause settings which should be applied as follows. If %MLO_PAUSE_AN is not + * set, %MLO_PAUSE_TX and %MLO_PAUSE_RX indicate whether the MAC should send + * pause frames and/or act on received pause frames respectively. Otherwise, + * the results of in-band negotiation/status from the MAC PCS should be used + * to control the MAC pause mode settings. + * * The action performed depends on the currently selected mode: * * %MLO_AN_FIXED, %MLO_AN_PHY: - * Configure the specified @state->speed, @state->duplex and - * @state->pause (%MLO_PAUSE_TX / %MLO_PAUSE_RX) modes over a link - * specified by @state->interface. @state->advertising may be used, - * but is not required. Other members of @state must be ignored. + * Configure the specified @state->speed and @state->duplex over a link + * specified by @state->interface. @state->advertising may be used, but + * is not required. Pause modes as above. Other members of @state must + * be ignored. 
* * Valid state members: interface, speed, duplex, pause, advertising. * @@ -170,11 +177,14 @@ void mac_pcs_get_state(struct phylink_config *config, * mac_pcs_get_state() callback. Changes in link state must be made * by calling phylink_mac_change(). * + * Interface mode specific details are mentioned below. + * * If in 802.3z mode, the link speed is fixed, dependent on the - * @state->interface. Duplex is negotiated, and pause is advertised - * according to @state->an_enabled, @state->pause and - * @state->advertising flags. Beware of MACs which only support full - * duplex at gigabit and higher speeds. + * @state->interface. Duplex and pause modes are negotiated via + * the in-band configuration word. Advertised pause modes are set + * according to the @state->an_enabled and @state->advertising + * flags. Beware of MACs which only support full duplex at gigabit + * and higher speeds. * * If in Cisco SGMII mode, the link speed and duplex mode are passed * in the serial bitstream 16-bit configuration word, and the MAC -- cgit v1.2.3 From d8fab4815a371e8013e1a769c31da1bcaf618b01 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:30:01 +0100 Subject: net/mlx5: fix spelling mistake "reserverd" -> "reserved" The reserved member should be named reserved. Signed-off-by: Alexandre Belloni Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc_fpga.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 37e065a80a43..07d77323f78a 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -608,7 +608,7 @@ struct mlx5_ifc_tls_cmd_bits { struct mlx5_ifc_tls_resp_bits { u8 syndrome[0x20]; u8 stream_id[0x20]; - u8 reserverd[0x40]; + u8 reserved[0x40]; }; #define MLX5_TLS_COMMAND_SIZE (0x100) -- cgit v1.2.3 From 12206b17235aed1ca6390b3e516825ae276f8345 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 11 Feb 2020 14:32:43 -0800 Subject: net/mlx5: Add support for resource dump On driver load: - Initialize resource dump data structure and memory access tools (mkey & pd). - Read the resource dump's menu which contains the FW segment identifier. Each record is identified by the segment name (ASCII). During the driver's course of life, users (like reporters) may request dumps per segment. The user should create a command providing the segment identifier (SW enumeration) and command keys. In return, the user receives a command context. In order to receive the dump, the user should supply the command context and a memory (aligned to a PAGE) on which the dump content will be written. Since the dump may be larger than the given memory, the user may resubmit the command until received an indication of end-of-dump. It is the user's responsibility to destroy the command. 
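The expected calling sequence can be sketched from the API added below; this is an illustrative sketch only, where consume_dump_data() is a hypothetical consumer and error handling is abbreviated (it mirrors the internal menu-reading loop in this patch):

static int dump_one_segment(struct mlx5_core_dev *dev, struct mlx5_rsc_key *key)
{
	struct mlx5_rsc_dump_cmd *cmd;
	struct page *page;
	int size, err;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	cmd = mlx5_rsc_dump_cmd_create(dev, key);
	if (IS_ERR(cmd)) {
		err = PTR_ERR(cmd);
		goto free_page;
	}

	/* Resubmit while mlx5_rsc_dump_next() reports more dump data (> 0);
	 * 0 means end-of-dump, negative values are errors.
	 */
	do {
		err = mlx5_rsc_dump_next(dev, cmd, page, &size);
		if (err < 0)
			break;
		consume_dump_data(page_address(page), size); /* hypothetical */
	} while (err > 0);

	mlx5_rsc_dump_cmd_destroy(cmd);
free_page:
	__free_page(page);
	return err < 0 ? err : 0;
}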
Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../ethernet/mellanox/mlx5/core/diag/rsc_dump.c | 286 +++++++++++++++++++++ .../ethernet/mellanox/mlx5/core/diag/rsc_dump.h | 58 +++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 12 + include/linux/mlx5/driver.h | 1 + 5 files changed, 358 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d3e06cec8317..e0bb8e12356e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,7 +16,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o diag/fs_tracepoint.o \ - diag/fw_tracer.o diag/crdump.o devlink.o + diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c new file mode 100644 index 000000000000..17ab7efe693d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2019 Mellanox Technologies. */ + +#include "rsc_dump.h" +#include "lib/mlx5.h" + +#define MLX5_SGMT_TYPE(SGMT) MLX5_SGMT_TYPE_##SGMT +#define MLX5_SGMT_STR_ASSING(SGMT)[MLX5_SGMT_TYPE(SGMT)] = #SGMT +static const char *const mlx5_rsc_sgmt_name[] = { + MLX5_SGMT_STR_ASSING(HW_CQPC), + MLX5_SGMT_STR_ASSING(HW_SQPC), + MLX5_SGMT_STR_ASSING(HW_RQPC), + MLX5_SGMT_STR_ASSING(FULL_SRQC), + MLX5_SGMT_STR_ASSING(FULL_CQC), + MLX5_SGMT_STR_ASSING(FULL_EQC), + MLX5_SGMT_STR_ASSING(FULL_QPC), + MLX5_SGMT_STR_ASSING(SND_BUFF), + MLX5_SGMT_STR_ASSING(RCV_BUFF), + MLX5_SGMT_STR_ASSING(SRQ_BUFF), + MLX5_SGMT_STR_ASSING(CQ_BUFF), + MLX5_SGMT_STR_ASSING(EQ_BUFF), + MLX5_SGMT_STR_ASSING(SX_SLICE), + MLX5_SGMT_STR_ASSING(SX_SLICE_ALL), + MLX5_SGMT_STR_ASSING(RDB), + MLX5_SGMT_STR_ASSING(RX_SLICE_ALL), +}; + +struct mlx5_rsc_dump { + u32 pdn; + struct mlx5_core_mkey mkey; + u16 fw_segment_type[MLX5_SGMT_TYPE_NUM]; +}; + +struct mlx5_rsc_dump_cmd { + u64 mem_size; + u8 cmd[MLX5_ST_SZ_BYTES(resource_dump)]; +}; + +static int mlx5_rsc_dump_sgmt_get_by_name(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mlx5_rsc_sgmt_name); i++) + if (!strcmp(name, mlx5_rsc_sgmt_name[i])) + return i; + + return -EINVAL; +} + +static void mlx5_rsc_dump_read_menu_sgmt(struct mlx5_rsc_dump *rsc_dump, struct page *page) +{ + void *data = page_address(page); + enum mlx5_sgmt_type sgmt_idx; + int num_of_items; + char *sgmt_name; + void *member; + void *menu; + int i; + + menu = MLX5_ADDR_OF(menu_resource_dump_response, data, menu); + num_of_items = MLX5_GET(resource_dump_menu_segment, menu, num_of_records); + + for (i = 0; i < num_of_items; i++) { + member = MLX5_ADDR_OF(resource_dump_menu_segment, menu, record[i]); + sgmt_name = MLX5_ADDR_OF(resource_dump_menu_record, member, segment_name); + sgmt_idx = mlx5_rsc_dump_sgmt_get_by_name(sgmt_name); + if (sgmt_idx == -EINVAL) + continue; + rsc_dump->fw_segment_type[sgmt_idx] = 
MLX5_GET(resource_dump_menu_record, + member, segment_type); + } +} + +static int mlx5_rsc_dump_trigger(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page) +{ + struct mlx5_rsc_dump *rsc_dump = dev->rsc_dump; + struct device *ddev = &dev->pdev->dev; + u32 out_seq_num; + u32 in_seq_num; + dma_addr_t dma; + int err; + + dma = dma_map_page(ddev, page, 0, cmd->mem_size, DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(ddev, dma))) + return -ENOMEM; + + in_seq_num = MLX5_GET(resource_dump, cmd->cmd, seq_num); + MLX5_SET(resource_dump, cmd->cmd, mkey, rsc_dump->mkey.key); + MLX5_SET64(resource_dump, cmd->cmd, address, dma); + + err = mlx5_core_access_reg(dev, cmd->cmd, sizeof(cmd->cmd), cmd->cmd, + sizeof(cmd->cmd), MLX5_REG_RESOURCE_DUMP, 0, 1); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to access err %d\n", err); + goto out; + } + out_seq_num = MLX5_GET(resource_dump, cmd->cmd, seq_num); + if (out_seq_num && (in_seq_num + 1 != out_seq_num)) + err = -EIO; +out: + dma_unmap_page(ddev, dma, cmd->mem_size, DMA_FROM_DEVICE); + return err; +} + +struct mlx5_rsc_dump_cmd *mlx5_rsc_dump_cmd_create(struct mlx5_core_dev *dev, + struct mlx5_rsc_key *key) +{ + struct mlx5_rsc_dump_cmd *cmd; + int sgmt_type; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return ERR_PTR(-EOPNOTSUPP); + + sgmt_type = dev->rsc_dump->fw_segment_type[key->rsc]; + if (!sgmt_type && key->rsc != MLX5_SGMT_TYPE_MENU) + return ERR_PTR(-EOPNOTSUPP); + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + mlx5_core_err(dev, "Resource dump: Failed to allocate command\n"); + return ERR_PTR(-ENOMEM); + } + MLX5_SET(resource_dump, cmd->cmd, segment_type, sgmt_type); + MLX5_SET(resource_dump, cmd->cmd, index1, key->index1); + MLX5_SET(resource_dump, cmd->cmd, index2, key->index2); + MLX5_SET(resource_dump, cmd->cmd, num_of_obj1, key->num_of_obj1); + MLX5_SET(resource_dump, cmd->cmd, num_of_obj2, key->num_of_obj2); + MLX5_SET(resource_dump, cmd->cmd, size, key->size); + cmd->mem_size = key->size; + return cmd; +} + +void mlx5_rsc_dump_cmd_destroy(struct mlx5_rsc_dump_cmd *cmd) +{ + kfree(cmd); +} + +int mlx5_rsc_dump_next(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page, int *size) +{ + bool more_dump; + int err; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return -EOPNOTSUPP; + + err = mlx5_rsc_dump_trigger(dev, cmd, page); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to trigger dump, %d\n", err); + return err; + } + *size = MLX5_GET(resource_dump, cmd->cmd, size); + more_dump = MLX5_GET(resource_dump, cmd->cmd, more_dump); + + return more_dump; +} + +#define MLX5_RSC_DUMP_MENU_SEGMENT 0xffff +static int mlx5_rsc_dump_menu(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump_cmd *cmd = NULL; + struct mlx5_rsc_key key = {}; + struct page *page; + int size; + int err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + key.rsc = MLX5_SGMT_TYPE_MENU; + key.size = PAGE_SIZE; + cmd = mlx5_rsc_dump_cmd_create(dev, &key); + if (IS_ERR(cmd)) { + err = PTR_ERR(cmd); + goto free_page; + } + MLX5_SET(resource_dump, cmd->cmd, segment_type, MLX5_RSC_DUMP_MENU_SEGMENT); + + do { + err = mlx5_rsc_dump_next(dev, cmd, page, &size); + if (err < 0) + goto destroy_cmd; + + mlx5_rsc_dump_read_menu_sgmt(dev->rsc_dump, page); + + } while (err > 0); + +destroy_cmd: + mlx5_rsc_dump_cmd_destroy(cmd); +free_page: + __free_page(page); + + return err; +} + +static int mlx5_rsc_dump_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + struct mlx5_core_mkey *mkey) +{ + int 
inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + +struct mlx5_rsc_dump *mlx5_rsc_dump_create(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump *rsc_dump; + + if (!MLX5_CAP_DEBUG(dev, resource_dump)) { + mlx5_core_dbg(dev, "Resource dump: capability not present\n"); + return NULL; + } + rsc_dump = kzalloc(sizeof(*rsc_dump), GFP_KERNEL); + if (!rsc_dump) + return ERR_PTR(-ENOMEM); + + return rsc_dump; +} + +void mlx5_rsc_dump_destroy(struct mlx5_core_dev *dev) +{ + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return; + kfree(dev->rsc_dump); +} + +int mlx5_rsc_dump_init(struct mlx5_core_dev *dev) +{ + struct mlx5_rsc_dump *rsc_dump = dev->rsc_dump; + int err; + + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return 0; + + err = mlx5_core_alloc_pd(dev, &rsc_dump->pdn); + if (err) { + mlx5_core_warn(dev, "Resource dump: Failed to allocate PD %d\n", err); + return err; + } + err = mlx5_rsc_dump_create_mkey(dev, rsc_dump->pdn, &rsc_dump->mkey); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to create mkey, %d\n", err); + goto free_pd; + } + err = mlx5_rsc_dump_menu(dev); + if (err) { + mlx5_core_err(dev, "Resource dump: Failed to read menu, %d\n", err); + goto destroy_mkey; + } + return err; + +destroy_mkey: + mlx5_core_destroy_mkey(dev, &rsc_dump->mkey); +free_pd: + mlx5_core_dealloc_pd(dev, rsc_dump->pdn); + return err; +} + +void mlx5_rsc_dump_cleanup(struct mlx5_core_dev *dev) +{ + if (IS_ERR_OR_NULL(dev->rsc_dump)) + return; + + mlx5_core_destroy_mkey(dev, &dev->rsc_dump->mkey); + mlx5_core_dealloc_pd(dev, dev->rsc_dump->pdn); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h new file mode 100644 index 000000000000..3b7573461a45 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/rsc_dump.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Mellanox Technologies. 
*/ + +#ifndef __MLX5_RSC_DUMP_H +#define __MLX5_RSC_DUMP_H + +#include <linux/mlx5/driver.h> +#include "mlx5_core.h" + +enum mlx5_sgmt_type { + MLX5_SGMT_TYPE_HW_CQPC, + MLX5_SGMT_TYPE_HW_SQPC, + MLX5_SGMT_TYPE_HW_RQPC, + MLX5_SGMT_TYPE_FULL_SRQC, + MLX5_SGMT_TYPE_FULL_CQC, + MLX5_SGMT_TYPE_FULL_EQC, + MLX5_SGMT_TYPE_FULL_QPC, + MLX5_SGMT_TYPE_SND_BUFF, + MLX5_SGMT_TYPE_RCV_BUFF, + MLX5_SGMT_TYPE_SRQ_BUFF, + MLX5_SGMT_TYPE_CQ_BUFF, + MLX5_SGMT_TYPE_EQ_BUFF, + MLX5_SGMT_TYPE_SX_SLICE, + MLX5_SGMT_TYPE_SX_SLICE_ALL, + MLX5_SGMT_TYPE_RDB, + MLX5_SGMT_TYPE_RX_SLICE_ALL, + MLX5_SGMT_TYPE_MENU, + MLX5_SGMT_TYPE_TERMINATE, + + MLX5_SGMT_TYPE_NUM, /* Keep last */ +}; + +struct mlx5_rsc_key { + enum mlx5_sgmt_type rsc; + int index1; + int index2; + int num_of_obj1; + int num_of_obj2; + int size; +}; + +#define MLX5_RSC_DUMP_ALL 0xFFFF +struct mlx5_rsc_dump_cmd; +struct mlx5_rsc_dump; + +struct mlx5_rsc_dump *mlx5_rsc_dump_create(struct mlx5_core_dev *dev); +void mlx5_rsc_dump_destroy(struct mlx5_core_dev *dev); + +int mlx5_rsc_dump_init(struct mlx5_core_dev *dev); +void mlx5_rsc_dump_cleanup(struct mlx5_core_dev *dev); + +struct mlx5_rsc_dump_cmd *mlx5_rsc_dump_cmd_create(struct mlx5_core_dev *dev, + struct mlx5_rsc_key *key); +void mlx5_rsc_dump_cmd_destroy(struct mlx5_rsc_dump_cmd *cmd); + +int mlx5_rsc_dump_next(struct mlx5_core_dev *dev, struct mlx5_rsc_dump_cmd *cmd, + struct page *page, int *size); +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f554cfddcf4e..204a26bf0a5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -70,6 +70,7 @@ #include "diag/fw_tracer.h" #include "ecpf.h" #include "lib/hv_vhca.h" +#include "diag/rsc_dump.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@ -880,6 +881,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) dev->tracer = mlx5_fw_tracer_create(dev); dev->hv_vhca = mlx5_hv_vhca_create(dev); + dev->rsc_dump = mlx5_rsc_dump_create(dev); return 0; @@ -909,6 +911,7 @@ err_devcom: static void mlx5_cleanup_once(struct mlx5_core_dev *dev) { + mlx5_rsc_dump_destroy(dev); mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); mlx5_dm_cleanup(dev); @@ -1079,6 +1082,12 @@ static int mlx5_load(struct mlx5_core_dev *dev) mlx5_hv_vhca_init(dev->hv_vhca); + err = mlx5_rsc_dump_init(dev); + if (err) { + mlx5_core_err(dev, "Failed to init Resource dump\n"); + goto err_rsc_dump; + } + err = mlx5_fpga_device_start(dev); if (err) { mlx5_core_err(dev, "fpga device start failed %d\n", err); @@ -1134,6 +1143,8 @@ err_tls_start: err_ipsec_start: mlx5_fpga_device_stop(dev); err_fpga_start: + mlx5_rsc_dump_cleanup(dev); +err_rsc_dump: mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_tracer_cleanup(dev->tracer); err_fw_tracer: @@ -1155,6 +1166,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_accel_ipsec_cleanup(dev); mlx5_accel_tls_cleanup(dev); mlx5_fpga_device_stop(dev); + mlx5_rsc_dump_cleanup(dev); mlx5_hv_vhca_cleanup(dev->hv_vhca); mlx5_fw_tracer_cleanup(dev->tracer); mlx5_eq_table_destroy(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 277a51d3ec40..f99cbe249425 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -722,6 +722,7 @@ struct mlx5_core_dev { struct mlx5_clock clock; struct mlx5_ib_clock_info *clock_info; struct mlx5_fw_tracer *tracer; + struct mlx5_rsc_dump *rsc_dump; u32 vsc_addr;
struct mlx5_hv_vhca *hv_vhca; }; -- cgit v1.2.3 From a4393861a351f66fef1102e775743c86a276afce Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 17 Feb 2020 12:15:28 +0000 Subject: bpf, sk_msg: Let ULP restore sk_proto and write_space callback We don't need a fallback for when the socket is not using ULP. tcp_update_ulp() handles this case exactly the same as we do in sk_psock_restore_proto(). Get rid of the duplicated code. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200217121530.754315-2-jakub@cloudflare.com --- include/linux/skmsg.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 14d61bba0b79..8605947d6c08 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -361,16 +361,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, sk->sk_prot->unhash = psock->saved_unhash; if (psock->sk_proto) { - struct inet_connection_sock *icsk = inet_csk(sk); - bool has_ulp = !!icsk->icsk_ulp_data; - - if (has_ulp) { - tcp_update_ulp(sk, psock->sk_proto, - psock->saved_write_space); - } else { - sk->sk_prot = psock->sk_proto; - sk->sk_write_space = psock->saved_write_space; - } + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); psock->sk_proto = NULL; } else { sk->sk_write_space = psock->saved_write_space; -- cgit v1.2.3 From a178b4585865a4c756c41bc5376f63416b7d9271 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 17 Feb 2020 12:15:29 +0000 Subject: bpf, sk_msg: Don't clear saved sock proto on restore There is no need to clear psock->sk_proto when restoring socket protocol callbacks in sk->sk_prot. The psock is about to get detached from the sock and eventually destroyed. At worst we will restore the protocol callbacks and the write callback twice. This makes reasoning about psock state easier. Once psock is initialized, we can count on psock->sk_proto always being set. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200217121530.754315-3-jakub@cloudflare.com --- include/linux/skmsg.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8605947d6c08..d90ef61712a1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -359,13 +359,7 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - - if (psock->sk_proto) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - psock->sk_proto = NULL; - } else { - sk->sk_write_space = psock->saved_write_space; - } + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } static inline void sk_psock_set_state(struct sk_psock *psock, -- cgit v1.2.3 From 0f0d3827c0b4d6c3d219a73ea103077dc5bc17aa Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 16 Feb 2020 12:01:26 +0200 Subject: net/mlx5: E-Switch, Move source port on reg_c0 to the upper 16 bits Multi chain support requires the miss path to continue processing from the last chain id, and for that we need to save the chain miss tag (a mapping for the 32-bit chain id) in reg_c0; this will come in a later patch. Currently reg_c0 is used exclusively to store the source port metadata, occupying all 32 bits: the value is built from 16 bits of vhca_id and 16 bits of vport number.
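For reference, the pre-patch encoding (the exact expression removed from mlx5_eswitch_get_vport_metadata_for_match() below) can be illustrated with a small standalone sketch; the example values are hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Pre-patch reg_c0 layout: 16 bits of vhca_id in the upper half and 16 bits
 * of vport number in the lower half -- all 32 bits of the register are
 * consumed, leaving no room for a chain miss tag.
 */
static uint32_t old_source_port_metadata(uint16_t vhca_id, uint16_t vport_num)
{
	return ((uint32_t)(vhca_id & 0xffff) << 16) | vport_num;
}

int main(void)
{
	/* e.g. vhca_id 0x0123, vport 5 -> reg_c0 = 0x01230005 */
	printf("reg_c0 = 0x%08x\n", old_source_port_metadata(0x0123, 5));
	return 0;
}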
We will move this source port metadata to upper 16bits, and leave the lower bits for the chain miss tag. We compress the reg_c0 source port metadata to 16bits by taking 8 bits from vhca_id, and 8bits from the vport number. Since we compress the vport number to 8bits statically, and leave two top ids for special PF/ECPF numbers, we will only support a max of 254 vports with this strategy. Signed-off-by: Paul Blakey Reviewed-by: Oz Shlomo Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 3 +- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 64 ++++++++++++++++++---- include/linux/mlx5/eswitch.h | 29 +++++++++- 3 files changed, 83 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e874d688d040..230028cb292c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3570,7 +3570,8 @@ static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev, misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); - MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); } else { misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 979f13bdc203..788bb83b18e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -85,7 +85,8 @@ mlx5_eswitch_set_rule_source_port(struct mlx5_eswitch *esw, attr->in_rep->vport)); misc2 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); - MLX5_SET_TO_ONES(fte_match_set_misc2, misc2, metadata_reg_c_0); + MLX5_SET(fte_match_set_misc2, misc2, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); @@ -621,7 +622,8 @@ static void peer_miss_rules_setup(struct mlx5_eswitch *esw, if (mlx5_eswitch_vport_match_metadata_enabled(esw)) { misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); - MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; } else { @@ -851,8 +853,9 @@ static void esw_set_flow_group_source_port(struct mlx5_eswitch *esw, match_criteria_enable, MLX5_MATCH_MISC_PARAMETERS_2); - MLX5_SET_TO_ONES(fte_match_param, match_criteria, - misc_parameters_2.metadata_reg_c_0); + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); } else { MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, @@ -1134,7 +1137,8 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, u16 vport, mlx5_eswitch_get_vport_metadata_for_match(esw, vport)); misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); - MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; } else { @@ -1604,11 +1608,19 @@ static int 
esw_vport_add_ingress_acl_modify_metadata(struct mlx5_eswitch *esw, static const struct mlx5_flow_spec spec = {}; struct mlx5_flow_act flow_act = {}; int err = 0; + u32 key; + + key = mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport); + key >>= ESW_SOURCE_PORT_METADATA_OFFSET; MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); - MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); - MLX5_SET(set_action_in, action, data, - mlx5_eswitch_get_vport_metadata_for_match(esw, vport->vport)); + MLX5_SET(set_action_in, action, field, + MLX5_ACTION_IN_FIELD_METADATA_REG_C_0); + MLX5_SET(set_action_in, action, data, key); + MLX5_SET(set_action_in, action, offset, + ESW_SOURCE_PORT_METADATA_OFFSET); + MLX5_SET(set_action_in, action, length, + ESW_SOURCE_PORT_METADATA_BITS); vport->ingress.offloads.modify_metadata = mlx5_modify_header_alloc(esw->dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS, @@ -2470,9 +2482,41 @@ bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) } EXPORT_SYMBOL(mlx5_eswitch_vport_match_metadata_enabled); -u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw, +u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num) { - return ((MLX5_CAP_GEN(esw->dev, vhca_id) & 0xffff) << 16) | vport_num; + u32 vport_num_mask = GENMASK(ESW_VPORT_BITS - 1, 0); + u32 vhca_id_mask = GENMASK(ESW_VHCA_ID_BITS - 1, 0); + u32 vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + u32 val; + + /* Make sure the vhca_id fits the ESW_VHCA_ID_BITS */ + WARN_ON_ONCE(vhca_id >= BIT(ESW_VHCA_ID_BITS)); + + /* Trim vhca_id to ESW_VHCA_ID_BITS */ + vhca_id &= vhca_id_mask; + + /* Make sure pf and ecpf map to end of ESW_VPORT_BITS range so they + * don't overlap with VF numbers, and themselves, after trimming. + */ + WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) < + vport_num_mask - 1); + WARN_ON_ONCE((MLX5_VPORT_ECPF & vport_num_mask) < + vport_num_mask - 1); + WARN_ON_ONCE((MLX5_VPORT_UPLINK & vport_num_mask) == + (MLX5_VPORT_ECPF & vport_num_mask)); + + /* Make sure that the VF vport_num fits ESW_VPORT_BITS and don't + * overlap with pf and ecpf. + */ + if (vport_num != MLX5_VPORT_UPLINK && + vport_num != MLX5_VPORT_ECPF) + WARN_ON_ONCE(vport_num >= vport_num_mask - 1); + + /* We can now trim vport_num to ESW_VPORT_BITS */ + vport_num &= vport_num_mask; + + val = (vhca_id << ESW_VPORT_BITS) | vport_num; + return val << (32 - ESW_SOURCE_PORT_METADATA_BITS); } EXPORT_SYMBOL(mlx5_eswitch_get_vport_metadata_for_match); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 98e667b176ef..dd1333f29f6e 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -71,7 +71,26 @@ enum devlink_eswitch_encap_mode mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev); bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); -u32 mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw, + +/* Reg C0 usage: + * Reg C0 = < ESW_VHCA_ID_BITS(8) | ESW_VPORT BITS(8) | ESW_CHAIN_TAG(16) > + * + * Highest 8 bits of the reg c0 is the vhca_id, next 8 bits is vport_num, + * the rest (lowest 16 bits) is left for tc chain tag restoration. + * VHCA_ID + VPORT comprise the SOURCE_PORT matching. 
+ */ +#define ESW_VHCA_ID_BITS 8 +#define ESW_VPORT_BITS 8 +#define ESW_SOURCE_PORT_METADATA_BITS (ESW_VHCA_ID_BITS + ESW_VPORT_BITS) +#define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS) +#define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) + +static inline u32 mlx5_eswitch_get_vport_metadata_mask(void) +{ + return GENMASK(31, 32 - ESW_SOURCE_PORT_METADATA_BITS); +} + +u32 mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num); u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw); #else /* CONFIG_MLX5_ESWITCH */ @@ -94,11 +113,17 @@ mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) }; static inline u32 -mlx5_eswitch_get_vport_metadata_for_match(const struct mlx5_eswitch *esw, +mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, int vport_num) { return 0; }; + +static inline u32 +mlx5_eswitch_get_vport_metadata_mask(void) +{ + return 0; +} #endif /* CONFIG_MLX5_ESWITCH */ #endif -- cgit v1.2.3 From 11b717d6152699623fb1133759f9b8f235935a51 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Sun, 16 Feb 2020 12:01:27 +0200 Subject: net/mlx5: E-Switch, Get reg_c0 value on CQE On the RX side create a restore table in the OFFLOADS namespace. This table will match on all reg_c0 values we will use, and set each matched value as the flow_tag. This flow tag can then be read on the CQE. As there is no copy action from reg_c0 to the flow tag, we instead have to set the flow tag explicitly. We add an API so callers can add all the used reg_c0 values (tags), and for each of those we add a restore rule. This will be used in a following patch to save the miss chain mapping tag on reg_c0 and from it restore the tc chain on the skb. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Reviewed-by: Oz Shlomo Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 14 ++ .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 147 +++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 +- include/linux/mlx5/eswitch.h | 2 + 4 files changed, 156 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 4472710ccc9c..a94d91cdc758 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -189,6 +189,9 @@ struct mlx5_eswitch_fdb { }; struct mlx5_esw_offload { + struct mlx5_flow_table *ft_offloads_restore; + struct mlx5_flow_group *restore_group; + struct mlx5_flow_table *ft_offloads; struct mlx5_flow_group *vport_rx_group; struct mlx5_eswitch_rep *vport_reps; @@ -623,6 +626,11 @@ void esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag); +u32 +esw_get_max_restore_tag(struct mlx5_eswitch *esw); + #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } @@ -638,6 +646,12 @@ static inline const u32 *mlx5_esw_query_functions(struct mlx5_core_dev *dev) static inline void mlx5_eswitch_update_num_of_vfs(struct mlx5_eswitch *esw, const int num_vfs) {} +static struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif /* CONFIG_MLX5_ESWITCH */ #endif /* __MLX5_ESWITCH_H__ */ diff --git
a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 788bb83b18e5..81c2cbf0c308 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -838,6 +838,54 @@ out: return err; } +struct mlx5_flow_handle * +esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) +{ + struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, }; + struct mlx5_flow_table *ft = esw->offloads.ft_offloads_restore; + struct mlx5_flow_context *flow_context; + struct mlx5_flow_handle *flow_rule; + struct mlx5_flow_destination dest; + struct mlx5_flow_spec *spec; + void *misc; + + spec = kzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return ERR_PTR(-ENOMEM); + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + ESW_CHAIN_TAG_METADATA_MASK); + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, tag); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + + flow_context = &spec->flow_context; + flow_context->flags |= FLOW_CONTEXT_HAS_TAG; + flow_context->flow_tag = tag; + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = esw->offloads.ft_offloads; + + flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1); + kfree(spec); + + if (IS_ERR(flow_rule)) + esw_warn(esw->dev, + "Failed to create restore rule for tag: %d, err(%d)\n", + tag, (int)PTR_ERR(flow_rule)); + + return flow_rule; +} + +u32 +esw_get_max_restore_tag(struct mlx5_eswitch *esw) +{ + return ESW_CHAIN_TAG_METADATA_MASK; +} + #define MAX_PF_SQ 256 #define MAX_SQ_NVPORTS 32 @@ -1060,6 +1108,7 @@ static int esw_create_offloads_table(struct mlx5_eswitch *esw, int nvports) } ft_attr.max_fte = nvports + MLX5_ESW_MISS_FLOWS; + ft_attr.prio = 1; ft_offloads = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(ft_offloads)) { @@ -1164,6 +1213,81 @@ out: return flow_rule; } +static void esw_destroy_restore_table(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_offload *offloads = &esw->offloads; + + mlx5_destroy_flow_group(offloads->restore_group); + mlx5_destroy_flow_table(offloads->ft_offloads_restore); +} + +static int esw_create_restore_table(struct mlx5_eswitch *esw) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *ns; + void *match_criteria, *misc; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + u32 *flow_group_in; + int err = 0; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS); + if (!ns) { + esw_warn(esw->dev, "Failed to get offloads flow namespace\n"); + return -EOPNOTSUPP; + } + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) { + err = -ENOMEM; + goto out_free; + } + + ft_attr.max_fte = 1 << ESW_CHAIN_TAG_METADATA_BITS; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + esw_warn(esw->dev, "Failed to create restore table, err %d\n", + err); + goto out_free; + } + + memset(flow_group_in, 0, inlen); + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + misc = MLX5_ADDR_OF(fte_match_param, match_criteria, + misc_parameters_2); + + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + 
ESW_CHAIN_TAG_METADATA_MASK); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft_attr.max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + g = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + esw_warn(dev, "Failed to create restore flow group, err: %d\n", + err); + goto err_group; + } + + esw->offloads.ft_offloads_restore = ft; + esw->offloads.restore_group = g; + + return 0; + +err_group: + mlx5_destroy_flow_table(ft); +out_free: + kvfree(flow_group_in); + + return err; +} + static int esw_offloads_start(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack) { @@ -1923,13 +2047,17 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) if (err) return err; - err = esw_create_offloads_fdb_tables(esw, total_vports); + err = esw_create_offloads_table(esw, total_vports); if (err) - goto create_fdb_err; + goto create_offloads_err; - err = esw_create_offloads_table(esw, total_vports); + err = esw_create_restore_table(esw); if (err) - goto create_ft_err; + goto create_restore_err; + + err = esw_create_offloads_fdb_tables(esw, total_vports); + if (err) + goto create_fdb_err; err = esw_create_vport_rx_group(esw, total_vports); if (err) @@ -1938,12 +2066,12 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) return 0; create_fg_err: - esw_destroy_offloads_table(esw); - -create_ft_err: esw_destroy_offloads_fdb_tables(esw); - create_fdb_err: + esw_destroy_restore_table(esw); +create_restore_err: + esw_destroy_offloads_table(esw); +create_offloads_err: esw_destroy_uplink_offloads_acl_tables(esw); return err; @@ -1952,8 +2080,9 @@ create_fdb_err: static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) { esw_destroy_vport_rx_group(esw); - esw_destroy_offloads_table(esw); esw_destroy_offloads_fdb_tables(esw); + esw_destroy_restore_table(esw); + esw_destroy_offloads_table(esw); esw_destroy_uplink_offloads_acl_tables(esw); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 9dc24241dc91..2660ffabb09f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -111,8 +111,8 @@ #define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1) #define OFFLOADS_MAX_FT 1 -#define OFFLOADS_NUM_PRIOS 1 -#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1) +#define OFFLOADS_NUM_PRIOS 2 +#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + OFFLOADS_NUM_PRIOS) #define LAG_PRIO_NUM_LEVELS 1 #define LAG_NUM_PRIOS 1 diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index dd1333f29f6e..61705e74a5bb 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -84,6 +84,8 @@ bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); #define ESW_SOURCE_PORT_METADATA_BITS (ESW_VHCA_ID_BITS + ESW_VPORT_BITS) #define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS) #define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) +#define ESW_CHAIN_TAG_METADATA_MASK GENMASK(ESW_CHAIN_TAG_METADATA_BITS - 1,\ + 0) static inline u32 mlx5_eswitch_get_vport_metadata_mask(void) { -- cgit v1.2.3 From b8e202d1d1d0f182f01062804efb523ea9a9008c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Tue, 18 Feb 2020 17:10:13 +0000 Subject: net, sk_msg: Annotate lockless access to sk_prot on clone sk_msg and ULP 
frameworks override protocol callbacks pointer in sk->sk_prot, while tcp accesses it locklessly when cloning the listening socket, that is with neither sk_lock nor sk_callback_lock held. Once we enable use of listening sockets with sockmap (and hence sk_msg), there will be shared access to sk->sk_prot if socket is getting cloned while being inserted/deleted to/from the sockmap from another CPU: Read side: tcp_v4_rcv sk = __inet_lookup_skb(...) tcp_check_req(sk) inet_csk(sk)->icsk_af_ops->syn_recv_sock tcp_v4_syn_recv_sock tcp_create_openreq_child inet_csk_clone_lock sk_clone_lock READ_ONCE(sk->sk_prot) Write side: sock_map_ops->map_update_elem sock_map_update_elem sock_map_update_common sock_map_link_no_progs tcp_bpf_init tcp_bpf_update_sk_prot sk_psock_update_proto WRITE_ONCE(sk->sk_prot, ops) sock_map_ops->map_delete_elem sock_map_delete_elem __sock_map_delete sock_map_unref sk_psock_put sk_psock_drop sk_psock_restore_proto tcp_update_ulp WRITE_ONCE(sk->sk_prot, proto) Mark the shared access with READ_ONCE/WRITE_ONCE annotations. Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200218171023.844439-2-jakub@cloudflare.com --- include/linux/skmsg.h | 3 ++- net/core/sock.c | 8 +++++--- net/ipv4/tcp_bpf.c | 4 +++- net/ipv4/tcp_ulp.c | 3 ++- net/tls/tls_main.c | 3 ++- 5 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d90ef61712a1..112765bd146d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk, psock->saved_write_space = sk->sk_write_space; psock->sk_proto = sk->sk_prot; - sk->sk_prot = ops; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, ops); } static inline void sk_psock_restore_proto(struct sock *sk, diff --git a/net/core/sock.c b/net/core/sock.c index a4c8fac781ff..bf1173b93eda 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1572,13 +1572,14 @@ static inline void sock_lock_init(struct sock *sk) */ static void sock_copy(struct sock *nsk, const struct sock *osk) { + const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, - osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; @@ -1792,16 +1793,17 @@ static void sk_init_common(struct sock *sk) */ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { + struct proto *prot = READ_ONCE(sk->sk_prot); struct sock *newsk; bool is_charged = true; - newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); + newsk = sk_prot_alloc(prot, priority, sk->sk_family); if (newsk != NULL) { struct sk_filter *filter; sock_copy(newsk, sk); - newsk->sk_prot_creator = sk->sk_prot; + newsk->sk_prot_creator = prot; /* SANITY */ if (likely(newsk->sk_net_refcnt)) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 8a01428f80c1..dd183b050642 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -645,8 +645,10 @@ static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed * or added requiring sk_prot hook updates. We keep original saved * hooks in this case. 
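In isolation, the annotation pattern this patch applies is just the following pairing (a minimal sketch; the psock and ULP bookkeeping around it is omitted):

/* Writer side: publish the new proto ops; pairs with the lockless
 * read in the clone path.
 */
static void publish_proto(struct sock *sk, struct proto *ops)
{
	WRITE_ONCE(sk->sk_prot, ops);
}

/* Reader side: may run with neither sk_lock nor sk_callback_lock held. */
static struct proto *clone_side_proto(const struct sock *sk)
{
	return READ_ONCE(sk->sk_prot);
}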
+ * + * Pairs with lockless read in sk_clone_lock(). */ - sk->sk_prot = &tcp_bpf_prots[family][config]; + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); } static int tcp_bpf_assert_proto_ops(struct proto *ops) diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 38d3ad141161..6c43fa189195 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -106,7 +106,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *proto, if (!icsk->icsk_ulp_ops) { sk->sk_write_space = write_space; - sk->sk_prot = proto; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, proto); return; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 94774c0e5ff3..82225bcc1117 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -742,7 +742,8 @@ static void tls_update(struct sock *sk, struct proto *p, ctx->sk_write_space = write_space; ctx->sk_proto = p; } else { - sk->sk_prot = p; + /* Pairs with lockless read in sk_clone_lock(). */ + WRITE_ONCE(sk->sk_prot, p); sk->sk_write_space = write_space; } } -- cgit v1.2.3 From ce69e2162f158d9d4a0e513971d02dabc7d14cb7 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Mon, 24 Feb 2020 16:53:58 -0600 Subject: mdio_bus: Add generic mdio_find_bus() It appears most ethernet drivers follow one of two main strategies for mdio bus/phy management: a monolithic model, where the net driver itself creates, probes and uses the phy; and a split model, where an external mdio/phy driver instantiates the mdio bus/phy and the net driver only attaches to a known phy. Usually in this latter model the phys are discovered via DT relationships or simply phy name/address hardcoding. This is a shame, because modern well-behaved mdio buses are self-describing and can be probed. The mdio layer itself is fully capable of this, yet there isn't a clean way for a standalone net driver to attach and enumerate the discovered devices. This is because outside of of_mdio_find_bus() there isn't a straightforward way to acquire the mii_bus pointer. So let's add an mdio_find_bus() which can return the mii_bus based only on its name. Signed-off-by: Jeremy Linton Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 17 +++++++++++++++++ include/linux/phy.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 9bb9f37f21dc..3ab9ca7614d1 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -462,6 +462,23 @@ static struct class mdio_bus_class = { .dev_groups = mdio_bus_groups, }; +/** + * mdio_find_bus - Given the name of a mdiobus, find the mii_bus. + * @mdio_name: The name of the mdiobus. + * + * Returns a reference to the mii_bus, or NULL if none found. The + * embedded struct device will have its reference count incremented, + * and this must be put_device()'d once the bus is finished with. + */ +struct mii_bus *mdio_find_bus(const char *mdio_name) +{ + struct device *d; + + d = class_find_device_by_name(&mdio_bus_class, mdio_name); + return d ? to_mii_bus(d) : NULL; +} +EXPORT_SYMBOL(mdio_find_bus); + #if IS_ENABLED(CONFIG_OF_MDIO) /** * of_mdio_find_bus - Given an mii_bus node, find the mii_bus.
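A hedged consumer sketch for the new helper; the bus name "foo-mii" and the surrounding driver context are assumptions, not from this patch:

struct mii_bus *bus;

bus = mdio_find_bus("foo-mii");	/* takes a device reference */
if (bus) {
	/* ... enumerate and attach to PHYs discovered on the bus ... */
	put_device(&bus->dev);	/* drop the reference when done */
}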
diff --git a/include/linux/phy.h b/include/linux/phy.h index 80f8b2158271..e72dbd0d2d6a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -289,6 +289,7 @@ static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) return devm_mdiobus_alloc_size(dev, 0); } +struct mii_bus *mdio_find_bus(const char *mdio_name); void devm_mdiobus_free(struct device *dev, struct mii_bus *bus); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); -- cgit v1.2.3 From 3c58482a382bae89410439247152eb342e9872f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Feb 2020 15:01:41 +0100 Subject: bpf: Provide bpf_prog_run_pin_on_cpu() helper BPF programs require to run on one CPU to completion as they use per CPU storage, but according to Alexei they don't need reentrancy protection as obviously BPF programs running in thread context can always be 'preempted' by hard and soft interrupts and instrumentation and the same program can run concurrently on a different CPU. The currently used mechanism to ensure CPUness is to wrap the invocation into a preempt_disable/enable() pair. Disabling preemption is also disabling migration for a task. preempt_disable/enable() is used because there is no explicit way to reliably disable only migration. Provide a separate macro to invoke a BPF program which can be used in migrateable task context. It wraps BPF_PROG_RUN() in a migrate_disable/enable() pair which maps on non RT enabled kernels to preempt_disable/enable(). On RT enabled kernels this merely disables migration. Both methods ensure that the invoked BPF program runs on one CPU to completion. Signed-off-by: Thomas Gleixner Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200224145643.474592620@linutronix.de --- include/linux/filter.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index f349e2c0884c..38f60188bb26 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); } \ ret; }) -#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \ - bpf_dispatcher_nopfunc) +#define BPF_PROG_RUN(prog, ctx) \ + __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc) + +/* + * Use in preemptible and therefore migratable context to make sure that + * the execution of the BPF program runs on one CPU. + * + * This uses migrate_disable/enable() explicitly to document that the + * invocation of a BPF program does not require reentrancy protection + * against a BPF program which is invoked from a preempting task. + * + * For non RT enabled kernels migrate_disable/enable() maps to + * preempt_disable/enable(), i.e. it disables also preemption. + */ +static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, + const void *ctx) +{ + u32 ret; + + migrate_disable(); + ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc); + migrate_enable(); + return ret; +} #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN -- cgit v1.2.3 From 37e1d9202225635772b32e340294208367279c2b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Feb 2020 15:01:42 +0100 Subject: bpf: Replace cant_sleep() with cant_migrate() As already discussed in the previous change which introduced BPF_RUN_PROG_PIN_ON_CPU() BPF only requires to disable migration to guarantee per CPUness. 
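As a usage illustration of bpf_prog_run_pin_on_cpu() (the caller below is hypothetical):

/* Run a BPF program from preemptible, migratable task context while
 * still guaranteeing it completes on one CPU.
 */
static u32 foo_run_filter(const struct bpf_prog *prog, struct sk_buff *skb)
{
	return bpf_prog_run_pin_on_cpu(prog, skb);
}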
If RT substitutes the preempt disable based migration protection then the cant_sleep() check will obviously trigger as preemption is not disabled. Replace it by cant_migrate() which maps to cant_sleep() on a non RT kernel and will verify that migration is disabled on a full RT kernel. Signed-off-by: Thomas Gleixner Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200224145643.583038889@linutronix.de --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 38f60188bb26..1982a52eb4c9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ u32 ret; \ - cant_sleep(); \ + cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ struct bpf_prog_stats *stats; \ u64 start = sched_clock(); \ -- cgit v1.2.3 From 3d9f773cf2876c01a505b9fe27270901d464e90a Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 24 Feb 2020 15:01:43 +0100 Subject: bpf: Use bpf_prog_run_pin_on_cpu() at simple call sites. All of these cases are strictly of the form: preempt_disable(); BPF_PROG_RUN(...); preempt_enable(); Replace this with bpf_prog_run_pin_on_cpu() which wraps BPF_PROG_RUN() with: migrate_disable(); BPF_PROG_RUN(...); migrate_enable(); On non RT enabled kernels this maps to preempt_disable/enable() and on RT enabled kernels this solely prevents migration, which is sufficient as there is no requirement to prevent reentrancy to any BPF program from a preempting task. The only requirement is that the program stays on the same CPU. Therefore, this is a trivially correct transformation. The seccomp loop does not need protection over the loop. It only needs protection per BPF filter program [ tglx: Converted to bpf_prog_run_pin_on_cpu() ] Signed-off-by: David S. Miller Signed-off-by: Thomas Gleixner Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200224145643.691493094@linutronix.de --- include/linux/filter.h | 4 +--- kernel/seccomp.c | 4 +--- net/core/flow_dissector.c | 4 +--- net/core/skmsg.c | 8 ++------ net/kcm/kcmsock.c | 4 +--- 5 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1982a52eb4c9..9270de2a0df8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -717,9 +717,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); - preempt_disable(); - res = BPF_PROG_RUN(prog, skb); - preempt_enable(); + res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b6ea3dcb57bf..787041eb011b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). 
*/ - preempt_disable(); for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, sd); + u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } - preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a1670dff0629..3eff84824c8b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; - preempt_disable(); - result = BPF_PROG_RUN(prog, ctx); - preempt_enable(); + result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index eeb28cb85664..c479372f2cd2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct bpf_prog *prog; int ret; - preempt_disable(); rcu_read_lock(); prog = READ_ONCE(psock->progs.msg_parser); if (unlikely(!prog)) { @@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; - ret = BPF_PROG_RUN(prog, msg); + ret = bpf_prog_run_pin_on_cpu(prog, msg); ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { @@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, } out: rcu_read_unlock(); - preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); @@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, skb->sk = psock->sk; bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - ret = BPF_PROG_RUN(prog, skb); - preempt_enable(); + ret = bpf_prog_run_pin_on_cpu(prog, skb); /* strparser clones the skb before handing it to a upper layer, * meaning skb_orphan has been called. We NULL sk on the way out * to ensure we don't trigger a BUG_ON() in skb/sk operations diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index ea9e73428ed9..56fac24a627a 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) struct bpf_prog *prog = psock->bpf_prog; int res; - preempt_disable(); - res = BPF_PROG_RUN(prog, skb); - preempt_enable(); + res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } -- cgit v1.2.3 From 2a916f2f546ca1c1e3323e2a4269307f6d9890eb Mon Sep 17 00:00:00 2001 From: David Miller Date: Mon, 24 Feb 2020 15:01:46 +0100 Subject: bpf: Use migrate_disable/enable in array macros and cgroup/lirc code. Replace the preemption disable/enable with migrate_disable/enable() to reflect the actual requirement and to allow PREEMPT_RT to substitute it with an actual migration disable mechanism which does not disable preemption. Including the code paths that go via __bpf_prog_run_save_cb(). Signed-off-by: David S. 
Miller Signed-off-by: Thomas Gleixner Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200224145643.998293311@linutronix.de --- include/linux/bpf.h | 8 ++++---- include/linux/filter.h | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49b1a70e12c8..76b3a0eb1502 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *_prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ - preempt_disable(); \ + migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ @@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, } \ _out: \ rcu_read_unlock(); \ - preempt_enable(); \ + migrate_enable(); \ _ret; \ }) @@ -932,7 +932,7 @@ _out: \ u32 ret; \ u32 _ret = 1; \ u32 _cn = 0; \ - preempt_disable(); \ + migrate_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ _item = &_array->items[0]; \ @@ -944,7 +944,7 @@ _out: \ _item++; \ } \ rcu_read_unlock(); \ - preempt_enable(); \ + migrate_enable(); \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 9270de2a0df8..43b5e455d2f5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -677,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb) return qdisc_skb_cb(skb)->data; } +/* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { @@ -702,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, { u32 res; - preempt_disable(); + migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); - preempt_enable(); + migrate_enable(); return res; } -- cgit v1.2.3 From c518cfa0c5ad75ddf3d743f1e35b9cf5fc2c346e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Feb 2020 15:01:47 +0100 Subject: bpf: Provide recursion prevention helpers The places which need to prevent the execution of trace type BPF programs to prevent deadlocks on the hash bucket lock do this open coded. Provide two inline functions, bpf_disable/enable_instrumentation() to replace these open coded protection constructs. Use migrate_disable/enable() instead of preempt_disable/enable() right away so this works on RT enabled kernels. On a !RT kernel migrate_disable / enable() are mapped to preempt_disable/enable(). These helpers use this_cpu_inc/dec() instead of __this_cpu_inc/dec() on an RT enabled kernel because migrate disabled regions are preemptible and preemption might hit in the middle of a RMW operation which can lead to inconsistent state. 
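A sketch of the kind of call site these helpers are meant for; the function and the update path below are illustrative assumptions:

/* Block instrumentation-attached programs around a map update that
 * may take a bucket lock, in a form safe on both RT and !RT kernels.
 */
static int foo_map_update(struct bpf_map *map, void *key, void *value,
			  u64 flags)
{
	int ret;

	bpf_disable_instrumentation();
	ret = map->ops->map_update_elem(map, key, value, flags);
	bpf_enable_instrumentation();
	return ret;
}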
Signed-off-by: Thomas Gleixner Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200224145644.103910133@linutronix.de --- include/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 76b3a0eb1502..1acd5bf70350 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -961,6 +961,36 @@ _out: \ #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); +/* + * Block execution of BPF programs attached to instrumentation (perf, + * kprobes, tracepoints) to prevent deadlocks on map operations as any of + * these events can happen inside a region which holds a map bucket lock + * and can deadlock on it. + * + * Use the preemption safe inc/dec variants on RT because migrate disable + * is preemptible on RT and preemption in the middle of the RMW operation + * might lead to inconsistent state. Use the raw variants for non RT + * kernels as migrate_disable() maps to preempt_disable() so the slightly + * more expensive save operation can be avoided. + */ +static inline void bpf_disable_instrumentation(void) +{ + migrate_disable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_inc(bpf_prog_active); + else + __this_cpu_inc(bpf_prog_active); +} + +static inline void bpf_enable_instrumentation(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_dec(bpf_prog_active); + else + __this_cpu_dec(bpf_prog_active); + migrate_enable(); +} + extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; -- cgit v1.2.3 From f70ce185687bbe4e2d7ff126a8c890631f5fc2af Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Feb 2020 04:37:11 +0100 Subject: sysfs: add sysfs_file_change_owner() Add helpers to change the owner of a sysfs files. This function will be used to correctly account for kobject ownership changes, e.g. when moving network devices between network namespaces. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- fs/sysfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysfs.h | 10 ++++++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 130fc6fbcc03..4ca936ca3ba4 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -558,3 +558,50 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); + +static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, + kgid_t kgid) +{ + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + return kernfs_setattr(kn, &newattrs); +} + +/** + * sysfs_file_change_owner - change owner of a sysfs file. + * @kobj: object. + * @name: name of the file to change. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs entry @name under @kobj and changes the + * ownership to @kuid/@kgid. + * + * Returns 0 on success or error code on failure. 
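For illustration, a hypothetical caller of the new helper (file name and error handling are assumptions):

int err;

err = sysfs_file_change_owner(&dev->kobj, "uevent", kuid, kgid);
if (err)
	dev_warn(dev, "failed to change sysfs file owner: %d\n", err);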
+ */ +int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *kn; + int error; + + if (!name) + return -EINVAL; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + kn = kernfs_find_and_get(kobj->sd, name); + if (!kn) + return -ENOENT; + + error = internal_change_owner(kn, kuid, kgid); + + kernfs_put(kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_file_change_owner); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index fa7ee503fb76..a7884024a911 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -310,6 +310,9 @@ static inline void sysfs_enable_ns(struct kernfs_node *kn) return kernfs_enable_ns(kn); } +int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, + kgid_t kgid); + #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) @@ -522,6 +525,13 @@ static inline void sysfs_enable_ns(struct kernfs_node *kn) { } +static inline int sysfs_file_change_owner(struct kobject *kobj, + const char *name, kuid_t kuid, + kgid_t kgid) +{ + return 0; +} + #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From 0666a3aee762cd4f7981c2eed0fd8cab87533539 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Feb 2020 04:37:12 +0100 Subject: sysfs: add sysfs_link_change_owner() Add a helper to change the owner of a sysfs link. This function will be used to correctly account for kobject ownership changes, e.g. when moving network devices between network namespaces. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- fs/sysfs/file.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/sysfs.h | 10 ++++++++++ 2 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 4ca936ca3ba4..332cd69b378c 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -570,6 +570,47 @@ static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, return kernfs_setattr(kn, &newattrs); } +/** + * sysfs_link_change_owner - change owner of a sysfs file. + * @kobj: object of the kernfs_node the symlink is located in. + * @targ: object of the kernfs_node the symlink points to. + * @name: name of the link. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs symlink entry @name under @kobj and changes + * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of + * @targ. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, + const char *name, kuid_t kuid, kgid_t kgid) +{ + struct kernfs_node *kn = NULL; + int error; + + if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) + return -EINVAL; + + error = -ENOENT; + kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); + if (!kn) + goto out; + + error = -EINVAL; + if (kernfs_type(kn) != KERNFS_LINK) + goto out; + if (kn->symlink.target_kn->priv != targ) + goto out; + + error = internal_change_owner(kn, kuid, kgid); + +out: + kernfs_put(kn); + return error; +} + /** * sysfs_file_change_owner - change owner of a sysfs file. * @kobj: object. 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a7884024a911..7e15ebfd750e 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -312,6 +312,8 @@ static inline void sysfs_enable_ns(struct kernfs_node *kn) int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); +int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, + const char *name, kuid_t kuid, kgid_t kgid); #else /* CONFIG_SYSFS */ @@ -532,6 +534,14 @@ static inline int sysfs_file_change_owner(struct kobject *kobj, return 0; } +static inline int sysfs_link_change_owner(struct kobject *kobj, + struct kobject *targ, + const char *name, kuid_t kuid, + kgid_t kgid) +{ + return 0; +} + #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From 303a42769c4c4d8e5e3ad928df87eb36f8c1fa60 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Feb 2020 04:37:13 +0100 Subject: sysfs: add sysfs_group{s}_change_owner() Add helpers to change the owner of sysfs groups. This function will be used to correctly account for kobject ownership changes, e.g. when moving network devices between network namespaces. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- fs/sysfs/group.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysfs.h | 20 +++++++++ 2 files changed, 135 insertions(+) (limited to 'include/linux') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index c4ab045926b7..5afe0e7ff7cd 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "sysfs.h" @@ -457,3 +458,117 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); + +static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, + const struct attribute_group *grp, + struct iattr *newattrs) +{ + struct kernfs_node *kn; + int error; + + if (grp->attrs) { + struct attribute *const *attr; + + for (attr = grp->attrs; *attr; attr++) { + kn = kernfs_find_and_get(grp_kn, (*attr)->name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + if (grp->bin_attrs) { + struct bin_attribute *const *bin_attr; + + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + return 0; +} + +/** + * sysfs_group_change_owner - change owner of an attribute group. + * @kobj: The kobject containing the group. + * @grp: The attribute group. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. 
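And a matching sketch for the group variant (hypothetical; foo_group stands in for whatever attribute group the caller registered):

int err;

err = sysfs_group_change_owner(&dev->kobj, &foo_group, kuid, kgid);
if (err)
	dev_warn(dev, "failed to change sysfs group owner: %d\n", err);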
+ */ +int sysfs_group_change_owner(struct kobject *kobj, + const struct attribute_group *grp, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *grp_kn; + int error; + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (grp->name) { + grp_kn = kernfs_find_and_get(kobj->sd, grp->name); + } else { + kernfs_get(kobj->sd); + grp_kn = kobj->sd; + } + if (!grp_kn) + return -ENOENT; + + error = kernfs_setattr(grp_kn, &newattrs); + if (!error) + error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); + + kernfs_put(grp_kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_group_change_owner); + +/** + * sysfs_groups_change_owner - change owner of a set of attribute groups. + * @kobj: The kobject containing the groups. + * @groups: The attribute groups. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. + */ +int sysfs_groups_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid) +{ + int error = 0, i; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); + if (error) + break; + } + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_groups_change_owner); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 7e15ebfd750e..3fcaabdb05ef 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -314,6 +314,12 @@ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); +int sysfs_groups_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid); +int sysfs_group_change_owner(struct kobject *kobj, + const struct attribute_group *groups, kuid_t kuid, + kgid_t kgid); #else /* CONFIG_SYSFS */ @@ -542,6 +548,20 @@ static inline int sysfs_link_change_owner(struct kobject *kobj, return 0; } +static inline int sysfs_groups_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid) +{ + return 0; +} + +static inline int sysfs_group_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid) +{ + return 0; +} + #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From 2c4f9401ceb00167a3bfd322a28aa87b646a253f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Feb 2020 04:37:14 +0100 Subject: sysfs: add sysfs_change_owner() Add a helper to change the owner of sysfs objects. This function will be used to correctly account for kobject ownership changes, e.g. when moving network devices between network namespaces. This mirrors how a kobject is added through driver core which in its guts is done via kobject_add_internal() which in summary creates the main directory via create_dir(), populates that directory with the groups associated with the ktype of the kobject (if any) and populates the directory with the basic attributes associated with the ktype of the kobject (if any). These are the basic steps that are associated with adding a kobject in sysfs. Any additional properties are added by the specific subsystem itself (not by driver core) after it has registered the device. 
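A sketch of that division of labor; sysfs_change_owner() covers what driver core created, while foo_extra_group is an assumed subsystem-specific group that its creator must re-own itself:

err = sysfs_change_owner(&dev->kobj, kuid, kgid);
if (!err)
	/* entries added outside driver core are the subsystem's job */
	err = sysfs_group_change_owner(&dev->kobj, &foo_extra_group,
				       kuid, kgid);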
So for the example of network devices, a network device will e.g. register a queue subdirectory under the basic sysfs directory for the network device and than further subdirectories within that queues subdirectory. But that is all specific to network devices and they call the corresponding sysfs functions to do that directly when they create those queue objects. So anything that a subsystem adds outside of what driver core does must also be changed by it (That's already true for removal of files it created outside of driver core.) and it's the same for ownership changes. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- fs/sysfs/file.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysfs.h | 6 ++++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 332cd69b378c..26bbf960e2a2 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -646,3 +646,63 @@ int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, return error; } EXPORT_SYMBOL_GPL(sysfs_file_change_owner); + +/** + * sysfs_change_owner - change owner of the given object. + * @kobj: object. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Change the owner of the default directory, files, groups, and attributes of + * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs + * entries for a kobject are added by driver core. In summary, + * sysfs_change_owner() takes care of the default directory entry for @kobj, + * the default attributes associated with the ktype of @kobj and the default + * attributes associated with the ktype of @kobj. + * Additional properties not added by driver core have to be changed by the + * driver or subsystem which created them. This is similar to how + * driver/subsystem specific entries are removed. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) +{ + int error; + const struct kobj_type *ktype; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + /* Change the owner of the kobject itself. */ + error = internal_change_owner(kobj->sd, kuid, kgid); + if (error) + return error; + + ktype = get_ktype(kobj); + if (ktype) { + struct attribute **kattr; + + /* + * Change owner of the default attributes associated with the + * ktype of @kobj. + */ + for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) { + error = sysfs_file_change_owner(kobj, (*kattr)->name, + kuid, kgid); + if (error) + return error; + } + + /* + * Change owner of the default groups associated with the + * ktype of @kobj. 
+ */ + error = sysfs_groups_change_owner(kobj, ktype->default_groups, + kuid, kgid); + if (error) + return error; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_change_owner); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 3fcaabdb05ef..9e531ec76274 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -312,6 +312,7 @@ static inline void sysfs_enable_ns(struct kernfs_node *kn) int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); +int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, @@ -548,6 +549,11 @@ static inline int sysfs_link_change_owner(struct kobject *kobj, return 0; } +static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) +{ + return 0; +} + static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) -- cgit v1.2.3 From b8f33e5d76a7a1b87e0cc760d05bf2477b4e91d6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 27 Feb 2020 04:37:15 +0100 Subject: device: add device_change_owner() Add a helper to change the owner of a device's sysfs entries. This needs to happen when the ownership of a device is changed, e.g. when moving network devices between network namespaces. This function will be used to correctly account for ownership changes, e.g. when moving network devices between network namespaces. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Christian Brauner Signed-off-by: David S. Miller --- drivers/base/core.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 1 + 2 files changed, 117 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 42a672456432..988f34ce2eb0 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3458,6 +3458,122 @@ out: } EXPORT_SYMBOL_GPL(device_move); +static int device_attrs_change_owner(struct device *dev, kuid_t kuid, + kgid_t kgid) +{ + struct kobject *kobj = &dev->kobj; + struct class *class = dev->class; + const struct device_type *type = dev->type; + int error; + + if (class) { + /* + * Change the device groups of the device class for @dev to + * @kuid/@kgid. + */ + error = sysfs_groups_change_owner(kobj, class->dev_groups, kuid, + kgid); + if (error) + return error; + } + + if (type) { + /* + * Change the device groups of the device type for @dev to + * @kuid/@kgid. + */ + error = sysfs_groups_change_owner(kobj, type->groups, kuid, + kgid); + if (error) + return error; + } + + /* Change the device groups of @dev to @kuid/@kgid. */ + error = sysfs_groups_change_owner(kobj, dev->groups, kuid, kgid); + if (error) + return error; + + if (device_supports_offline(dev) && !dev->offline_disabled) { + /* Change online device attributes of @dev to @kuid/@kgid. */ + error = sysfs_file_change_owner(kobj, dev_attr_online.attr.name, + kuid, kgid); + if (error) + return error; + } + + return 0; +} + +/** + * device_change_owner - change the owner of an existing device. + * @dev: device. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This changes the owner of @dev and its corresponding sysfs entries to + * @kuid/@kgid. This function closely mirrors how @dev was added via driver + * core. + * + * Returns 0 on success or error code on failure. 
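To illustrate the intended consumer, a hedged sketch of re-owning a device's sysfs entries when it moves into another user namespace (the function and namespace handling are assumptions):

static int foo_reown_device(struct device *dev, struct user_namespace *ns)
{
	kuid_t kuid = make_kuid(ns, 0);
	kgid_t kgid = make_kgid(ns, 0);

	if (!uid_valid(kuid) || !gid_valid(kgid))
		return -EINVAL;

	return device_change_owner(dev, kuid, kgid);
}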
+ */ +int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) +{ + int error; + struct kobject *kobj = &dev->kobj; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + /* + * Change the kobject and the default attributes and groups of the + * ktype associated with it to @kuid/@kgid. + */ + error = sysfs_change_owner(kobj, kuid, kgid); + if (error) + goto out; + + /* + * Change the uevent file for @dev to the new owner. The uevent file + * was created in a separate step when @dev got added and we mirror + * that step here. + */ + error = sysfs_file_change_owner(kobj, dev_attr_uevent.attr.name, kuid, + kgid); + if (error) + goto out; + + /* + * Change the device groups, the device groups associated with the + * device class, and the groups associated with the device type of @dev + * to @kuid/@kgid. + */ + error = device_attrs_change_owner(dev, kuid, kgid); + if (error) + goto out; + +#ifdef CONFIG_BLOCK + if (sysfs_deprecated && dev->class == &block_class) + goto out; +#endif + + /* + * Change the owner of the symlink located in the class directory of + * the device class associated with @dev which points to the actual + * directory entry for @dev to @kuid/@kgid. This ensures that the + * symlink shows the same permissions as its target. + */ + error = sysfs_link_change_owner(&dev->class->p->subsys.kobj, &dev->kobj, + dev_name(dev), kuid, kgid); + if (error) + goto out; + +out: + put_device(dev); + return error; +} +EXPORT_SYMBOL_GPL(device_change_owner); + /** * device_shutdown - call ->shutdown() on each device to shutdown. */ diff --git a/include/linux/device.h b/include/linux/device.h index 0cd7c647c16c..3e40533d2037 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -817,6 +817,7 @@ extern struct device *device_find_child_by_name(struct device *parent, extern int device_rename(struct device *dev, const char *new_name); extern int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); +extern int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); extern const char *device_get_devnode(struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); -- cgit v1.2.3 From 91a208f2185ad4855ff03c342d0b7e4f5fc6f5df Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 26 Feb 2020 10:23:41 +0000 Subject: net: phylink: propagate resolved link config via mac_link_up() Propagate the resolved link parameters via the mac_link_up() call for MACs that do not automatically track their PCS state. We propagate the link parameters via function arguments so that inappropriate members of struct phylink_link_state can't be accessed, and creating a new structure just for this adds needless complexity to the API. Tested-by: Andre Przywara Tested-by: Alexandre Belloni Tested-by: Vladimir Oltean Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- Documentation/networking/sfp-phylink.rst | 17 +++++-- drivers/net/ethernet/cadence/macb_main.c | 7 ++- drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c | 7 ++- drivers/net/ethernet/marvell/mvneta.c | 8 ++-- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 19 +++++--- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 7 +-- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 7 +-- drivers/net/phy/phylink.c | 9 +++- include/linux/phylink.h | 57 +++++++++++++++++------ net/dsa/port.c | 4 +- 11 files changed, 105 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst index d753a309f9d1..8d7af28cd835 100644 --- a/Documentation/networking/sfp-phylink.rst +++ b/Documentation/networking/sfp-phylink.rst @@ -74,10 +74,13 @@ phylib to the sfp/phylink support. Please send patches to improve this documentation. 1. Optionally split the network driver's phylib update function into - three parts dealing with link-down, link-up and reconfiguring the - MAC settings. This can be done as a separate preparation commit. + two parts dealing with link-down and link-up. This can be done as + a separate preparation commit. - An example of this preparation can be found in git commit fc548b991fb0. + An older example of this preparation can be found in git commit + fc548b991fb0, although this was splitting into three parts; the + link-up part now includes configuring the MAC for the link settings. + Please see :c:func:`mac_link_up` for more information on this. 2. Replace:: @@ -207,6 +210,14 @@ this documentation. using. This is particularly important for in-band negotiation methods such as 1000base-X and SGMII. + The :c:func:`mac_link_up` method is used to inform the MAC that the + link has come up. The call includes the negotiation mode and interface + for reference only. The finalised link parameters are also supplied + (speed, duplex and flow control/pause enablement settings) which + should be used to configure the MAC when the MAC and PCS are not + tightly integrated, or when the settings are not coming from in-band + negotiation. + The :c:func:`mac_config` method is used to update the MAC with the requested state, and must avoid unnecessarily taking the link down when making changes to the MAC configuration. 
This means the diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 2c28da1737fe..7ab0bef5e1bd 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -626,8 +626,11 @@ static void macb_mac_link_down(struct phylink_config *config, unsigned int mode, netif_tx_stop_all_queues(ndev); } -static void macb_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, struct phy_device *phy) +static void macb_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct net_device *ndev = to_net_dev(config->dev); struct macb *bp = netdev_priv(ndev); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index 84233e467ed1..3a75c5b58f95 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -154,8 +154,11 @@ static void dpaa2_mac_config(struct phylink_config *config, unsigned int mode, netdev_err(mac->net_dev, "dpmac_set_link_state() = %d\n", err); } -static void dpaa2_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, struct phy_device *phy) +static void dpaa2_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct dpaa2_mac *mac = phylink_to_dpaa2_mac(config); struct dpmac_link_state *dpmac_state = &mac->state; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 1c391f63a26f..9af3f8d5b289 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3965,9 +3965,11 @@ static void mvneta_mac_link_down(struct phylink_config *config, mvneta_set_eee(pp, false); } -static void mvneta_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, - struct phy_device *phy) +static void mvneta_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct net_device *ndev = to_net_dev(config->dev); struct mvneta_port *pp = netdev_priv(ndev); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 72133cbe55d4..ed8042d97e29 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -58,8 +58,11 @@ static struct { */ static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state); -static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, struct phy_device *phy); +static void mvpp2_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause); /* Queue modes */ #define MVPP2_QDIST_SINGLE_MODE 0 @@ -3473,8 +3476,9 @@ static void mvpp2_start_dev(struct mvpp2_port *port) .interface = port->phy_interface, }; mvpp2_mac_config(&port->phylink_config, MLO_AN_INBAND, &state); - mvpp2_mac_link_up(&port->phylink_config, MLO_AN_INBAND, - port->phy_interface, NULL); + mvpp2_mac_link_up(&port->phylink_config, NULL, + MLO_AN_INBAND, 
port->phy_interface, + SPEED_UNKNOWN, DUPLEX_UNKNOWN, false, false); } netif_tx_start_all_queues(port->dev); @@ -5141,8 +5145,11 @@ static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode, mvpp2_port_enable(port); } -static void mvpp2_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, struct phy_device *phy) +static void mvpp2_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct net_device *dev = to_net_dev(config->dev); struct mvpp2_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 8c6cfd15481c..8d28f90acfe7 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -412,9 +412,10 @@ static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); } -static void mtk_mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, - struct phy_device *phy) +static void mtk_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, bool rx_pause) { struct mtk_mac *mac = container_of(config, struct mtk_mac, phylink_config); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 37920b4da091..e039e715dcee 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -950,8 +950,10 @@ static void stmmac_mac_link_down(struct phylink_config *config, } static void stmmac_mac_link_up(struct phylink_config *config, + struct phy_device *phy, unsigned int mode, phy_interface_t interface, - struct phy_device *phy) + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 20746b801959..197740781157 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1486,9 +1486,10 @@ static void axienet_mac_link_down(struct phylink_config *config, } static void axienet_mac_link_up(struct phylink_config *config, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phy) + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) { /* nothing meaningful to do */ } diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 2899fbe699ab..b4367fab7899 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -480,8 +480,11 @@ static void phylink_mac_link_up(struct phylink *pl, struct net_device *ndev = pl->netdev; pl->cur_interface = link_state.interface; - pl->ops->mac_link_up(pl->config, pl->cur_link_an_mode, - pl->cur_interface, pl->phydev); + pl->ops->mac_link_up(pl->config, pl->phydev, + pl->cur_link_an_mode, pl->cur_interface, + link_state.speed, link_state.duplex, + !!(link_state.pause & MLO_PAUSE_TX), + !!(link_state.pause & MLO_PAUSE_RX)); if (ndev) netif_carrier_on(ndev); @@ -547,6 +550,8 @@ static void phylink_resolve(struct work_struct *w) link_state.pause = pl->phy_state.pause; phylink_apply_manual_flow(pl, 
&link_state); phylink_mac_config(pl, &link_state); + } else { + phylink_apply_manual_flow(pl, &link_state); } break; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 812357c03df4..2180eb1aa254 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -91,9 +91,10 @@ struct phylink_mac_ops { void (*mac_an_restart)(struct phylink_config *config); void (*mac_link_down)(struct phylink_config *config, unsigned int mode, phy_interface_t interface); - void (*mac_link_up)(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, - struct phy_device *phy); + void (*mac_link_up)(struct phylink_config *config, + struct phy_device *phy, unsigned int mode, + phy_interface_t interface, int speed, int duplex, + bool tx_pause, bool rx_pause); }; #if 0 /* For kernel-doc purposes only. */ @@ -152,6 +153,9 @@ void mac_pcs_get_state(struct phylink_config *config, * guaranteed to be correct, and so any mac_config() implementation must * never reference these fields. * + * (this requires a rewrite - please refer to mac_link_up() for situations + * where the PCS and MAC are not tightly integrated.) + * * In all negotiation modes, as defined by @mode, @state->pause indicates the * pause settings which should be applied as follows. If %MLO_PAUSE_AN is not * set, %MLO_PAUSE_TX and %MLO_PAUSE_RX indicate whether the MAC should send @@ -162,12 +166,20 @@ void mac_pcs_get_state(struct phylink_config *config, * The action performed depends on the currently selected mode: * * %MLO_AN_FIXED, %MLO_AN_PHY: - * Configure the specified @state->speed and @state->duplex over a link - * specified by @state->interface. @state->advertising may be used, but - * is not required. Pause modes as above. Other members of @state must - * be ignored. + * Configure for non-inband negotiation mode, where the link settings + * are completely communicated via mac_link_up(). The physical link + * protocol from the MAC is specified by @state->interface. + * + * @state->advertising may be used, but is not required. + * + * Older drivers (prior to the mac_link_up() change) may use @state->speed, + * @state->duplex and @state->pause to configure the MAC, but this is + * deprecated; such drivers should be converted to use mac_link_up(). * - * Valid state members: interface, speed, duplex, pause, advertising. + * Other members of @state must be ignored. + * + * Valid state members: interface, advertising. + * Deprecated state members: speed, duplex, pause. * * %MLO_AN_INBAND: * place the link in an inband negotiation mode (such as 802.3z @@ -228,19 +240,34 @@ void mac_link_down(struct phylink_config *config, unsigned int mode, /** * mac_link_up() - allow the link to come up * @config: a pointer to a &struct phylink_config. + * @phy: any attached phy * @mode: link autonegotiation mode * @interface: link &typedef phy_interface_t mode - * @phy: any attached phy + * @speed: link speed + * @duplex: link duplex + * @tx_pause: link transmit pause enablement status + * @rx_pause: link receive pause enablement status * - * If @mode is not an in-band negotiation mode (as defined by - * phylink_autoneg_inband()), allow the link to come up. If @phy - * is non-%NULL, configure Energy Efficient Ethernet by calling + * Configure the MAC for an established link. 
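To make the new method signature concrete, a minimal mac_link_up() implementation for a hypothetical driver might look like the sketch below; the foo_* helpers, struct foo_priv and the register bits are invented for illustration (the macb and mvneta changes above show real conversions):

static void foo_mac_link_up(struct phylink_config *config,
                            struct phy_device *phy,
                            unsigned int mode, phy_interface_t interface,
                            int speed, int duplex,
                            bool tx_pause, bool rx_pause)
{
        struct foo_priv *priv = netdev_priv(to_net_dev(config->dev));
        u32 ctrl = foo_read(priv, FOO_MAC_CTRL);  /* hypothetical register */

        /* Program the finalised link parameters directly; the MAC no
         * longer has to re-derive them from the PHY or the PCS.
         */
        ctrl &= ~(FOO_SPEED_MASK | FOO_FULL_DUPLEX |
                  FOO_TX_PAUSE | FOO_RX_PAUSE);
        if (speed == SPEED_1000)
                ctrl |= FOO_SPEED_1000;
        else if (speed == SPEED_100)
                ctrl |= FOO_SPEED_100;
        if (duplex == DUPLEX_FULL)
                ctrl |= FOO_FULL_DUPLEX;
        if (tx_pause)
                ctrl |= FOO_TX_PAUSE;
        if (rx_pause)
                ctrl |= FOO_RX_PAUSE;

        foo_write(priv, FOO_MAC_CTRL, ctrl);
        foo_enable_tx_rx(priv);                 /* bring the MAC up last */
}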
+ * + * @speed, @duplex, @tx_pause and @rx_pause indicate the finalised link + * settings, and should be used to configure the MAC block appropriately + * where these settings are not automatically conveyed from the PCS block, + * or if in-band negotiation (as defined by phylink_autoneg_inband(@mode)) + * is disabled. + * + * Note that when 802.3z in-band negotiation is in use, it is possible + * that the user wishes to override the pause settings, and this should + * be allowed when considering the implementation of this method. + * + * If in-band negotiation mode is disabled, allow the link to come up. If + * @phy is non-%NULL, configure Energy Efficient Ethernet by calling * phy_init_eee() and perform appropriate MAC configuration for EEE. * Interface type selection must be done in mac_config(). */ -void mac_link_up(struct phylink_config *config, unsigned int mode, - phy_interface_t interface, - struct phy_device *phy); +void mac_link_up(struct phylink_config *config, struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, bool rx_pause); #endif struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, diff --git a/net/dsa/port.c b/net/dsa/port.c index 774facb8d547..b2f5262b35cf 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -489,9 +489,11 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, } static void dsa_port_phylink_mac_link_up(struct phylink_config *config, + struct phy_device *phydev, unsigned int mode, phy_interface_t interface, - struct phy_device *phydev) + int speed, int duplex, + bool tx_pause, bool rx_pause) { struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; -- cgit v1.2.3 From d7f10df86202273155a9d8f8553bc2ad28e0dd46 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 26 Feb 2020 18:17:44 -0600 Subject: bpf: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200227001744.GA3317@embeddedor --- include/linux/bpf-cgroup.h | 2 +- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 2 +- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/hashtab.c | 2 +- kernel/bpf/lpm_trie.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a11d5b7dbbf3..a7cd5c7a2509 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -36,7 +36,7 @@ struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; - char data[0]; + char data[]; }; struct bpf_cgroup_storage { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1acd5bf70350..9aa33b8f3d55 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -859,7 +859,7 @@ struct bpf_prog_array_item { struct bpf_prog_array { struct rcu_head rcu; - struct bpf_prog_array_item items[0]; + struct bpf_prog_array_item items[]; }; struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 906e9f2752db..8e98ced0963b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -73,7 +73,7 @@ struct bpf_insn { /* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ struct bpf_lpm_trie_key { __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ + __u8 data[]; /* Arbitrary size */ }; struct bpf_cgroup_storage_key { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..c498f0fffb40 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -23,7 +23,7 @@ enum bpf_struct_ops_state { struct bpf_struct_ops_value { BPF_STRUCT_OPS_COMMON_VALUE; - char data[0] ____cacheline_aligned_in_smp; + char data[] ____cacheline_aligned_in_smp; }; struct bpf_struct_ops_map { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 53d9483fee10..d541c8486c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -118,7 +118,7 @@ struct htab_elem { struct bpf_lru_node lru_node; }; u32 hash; - char key[0] __aligned(8); + char key[] __aligned(8); }; static inline bool htab_is_prealloc(const struct bpf_htab *htab) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 3b3c420bc8ed..65c236cf341e 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -25,7 +25,7 @@ struct lpm_trie_node { struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; - u8 data[0]; + u8 data[]; }; struct lpm_trie { -- cgit v1.2.3 From 96e326878fa5e2727d14e9a23644119374619010 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 14 Jan 2020 17:30:41 +0200 Subject: net/mlx5e: Eswitch, Use per vport tables for mirroring When using port mirroring, we forward the traffic to another table and use that table to forward to the mirrored vport. Since the hardware loses the values of reg c, and in particular reg c0, we fail the match on the input vport which previously existed in reg c0. To overcome this situation, we use a set of per vport tables, positioned at the lowest priority, and forward traffic to those tables. Since these tables are per vport, we can avoid matching on reg c0. 
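The per-vport table code below is an instance of a common kernel recipe: a mutex-protected hashtable of reference-counted entries, where get() returns an existing entry (bumping its count) or allocates and inserts a new one, and put() frees the entry once the last user is gone. A stripped-down sketch of the recipe, with generic names rather than the driver's, assuming <linux/hashtable.h>, <linux/mutex.h> and <linux/slab.h>:

static DEFINE_MUTEX(cache_lock);
static DEFINE_HASHTABLE(cache, 8);

struct obj_entry {
        struct hlist_node hlist;
        u32 key;        /* a plain u32 here; the driver below hashes a
                         * packed {chain, prio, vport, vhca_id} key */
        u32 num_users;
        /* ... the cached resource ... */
};

static struct obj_entry *obj_get(u32 key)
{
        struct obj_entry *e;

        mutex_lock(&cache_lock);
        hash_for_each_possible(cache, e, hlist, key) {
                if (e->key == key) {
                        e->num_users++;
                        goto out;
                }
        }

        e = kzalloc(sizeof(*e), GFP_KERNEL);
        if (e) {
                e->key = key;
                e->num_users = 1;
                hash_add(cache, &e->hlist, key);
        }
out:
        mutex_unlock(&cache_lock);
        return e;
}

static void obj_put(u32 key)
{
        struct obj_entry *e;

        mutex_lock(&cache_lock);
        hash_for_each_possible(cache, e, hlist, key) {
                if (e->key == key && --e->num_users == 0) {
                        hash_del(&e->hlist);
                        kfree(e);
                        break;
                }
        }
        mutex_unlock(&cache_lock);
}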
Fixes: c01cfd0f1115 ("net/mlx5: E-Switch, Add match on vport metadata for rule in fast path") Signed-off-by: Eli Cohen Reviewed-by: Mark Bloch Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 10 + .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 206 ++++++++++++++++++++- .../mellanox/mlx5/core/eswitch_offloads_chains.c | 11 +- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 11 ++ include/linux/mlx5/fs.h | 1 + 5 files changed, 221 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 4472710ccc9c..479d2458f872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -49,6 +49,7 @@ /* The index of the last real chain (FT) + 1 as chain zero is valid as well */ #define FDB_NUM_CHAINS (FDB_FT_CHAIN + 1) +#define ESW_OFFLOADS_NUM_GROUPS 4 #define FDB_TC_MAX_PRIO 16 #define FDB_TC_LEVELS_PER_PRIO 2 @@ -183,6 +184,12 @@ struct mlx5_eswitch_fdb { int vlan_push_pop_refcount; struct mlx5_esw_chains_priv *esw_chains_priv; + struct { + DECLARE_HASHTABLE(table, 8); + /* Protects vports.table */ + struct mutex lock; + } vports; + } offloads; }; u32 flags; @@ -623,6 +630,9 @@ void esw_vport_destroy_offloads_acl_tables(struct mlx5_eswitch *esw, struct mlx5_vport *vport); +int mlx5_esw_vport_tbl_get(struct mlx5_eswitch *esw); +void mlx5_esw_vport_tbl_put(struct mlx5_eswitch *esw); + #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1a57b2bd74b8..9a72c719d8f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -50,6 +50,179 @@ #define MLX5_ESW_MISS_FLOWS (2) #define UPLINK_REP_INDEX 0 +/* Per vport tables */ + +#define MLX5_ESW_VPORT_TABLE_SIZE 128 + +/* This struct is used as a key to the hash table and we need it to be packed + * so hash result is consistent + */ +struct mlx5_vport_key { + u32 chain; + u16 prio; + u16 vport; + u16 vhca_id; +} __packed; + +struct mlx5_vport_table { + struct hlist_node hlist; + struct mlx5_flow_table *fdb; + u32 num_rules; + struct mlx5_vport_key key; +}; + +static struct mlx5_flow_table * +esw_vport_tbl_create(struct mlx5_eswitch *esw, struct mlx5_flow_namespace *ns) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_table *fdb; + + ft_attr.autogroup.max_num_groups = ESW_OFFLOADS_NUM_GROUPS; + ft_attr.max_fte = MLX5_ESW_VPORT_TABLE_SIZE; + ft_attr.prio = FDB_PER_VPORT; + fdb = mlx5_create_auto_grouped_flow_table(ns, &ft_attr); + if (IS_ERR(fdb)) { + esw_warn(esw->dev, "Failed to create per vport FDB Table err %ld\n", + PTR_ERR(fdb)); + } + + return fdb; +} + +static u32 flow_attr_to_vport_key(struct mlx5_eswitch *esw, + struct mlx5_esw_flow_attr *attr, + struct mlx5_vport_key *key) +{ + key->vport = attr->in_rep->vport; + key->chain = attr->chain; + key->prio = attr->prio; + key->vhca_id = MLX5_CAP_GEN(esw->dev, vhca_id); + return jhash(key, sizeof(*key), 0); +} + +/* caller must hold vports.lock */ +static struct mlx5_vport_table * +esw_vport_tbl_lookup(struct mlx5_eswitch *esw, struct mlx5_vport_key *skey, u32 key) +{ + struct mlx5_vport_table *e; + + 
hash_for_each_possible(esw->fdb_table.offloads.vports.table, e, hlist, key) + if (!memcmp(&e->key, skey, sizeof(*skey))) + return e; + + return NULL; +} + +static void +esw_vport_tbl_put(struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_vport_table *e; + struct mlx5_vport_key key; + u32 hkey; + + mutex_lock(&esw->fdb_table.offloads.vports.lock); + hkey = flow_attr_to_vport_key(esw, attr, &key); + e = esw_vport_tbl_lookup(esw, &key, hkey); + if (!e || --e->num_rules) + goto out; + + hash_del(&e->hlist); + mlx5_destroy_flow_table(e->fdb); + kfree(e); +out: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); +} + +static struct mlx5_flow_table * +esw_vport_tbl_get(struct mlx5_eswitch *esw, struct mlx5_esw_flow_attr *attr) +{ + struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *fdb; + struct mlx5_vport_table *e; + struct mlx5_vport_key skey; + u32 hkey; + + mutex_lock(&esw->fdb_table.offloads.vports.lock); + hkey = flow_attr_to_vport_key(esw, attr, &skey); + e = esw_vport_tbl_lookup(esw, &skey, hkey); + if (e) { + e->num_rules++; + goto out; + } + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (!e) { + fdb = ERR_PTR(-ENOMEM); + goto err_alloc; + } + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); + if (!ns) { + esw_warn(dev, "Failed to get FDB namespace\n"); + fdb = ERR_PTR(-ENOENT); + goto err_ns; + } + + fdb = esw_vport_tbl_create(esw, ns); + if (IS_ERR(fdb)) + goto err_ns; + + e->fdb = fdb; + e->num_rules = 1; + e->key = skey; + hash_add(esw->fdb_table.offloads.vports.table, &e->hlist, hkey); +out: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); + return e->fdb; + +err_ns: + kfree(e); +err_alloc: + mutex_unlock(&esw->fdb_table.offloads.vports.lock); + return fdb; +} + +int mlx5_esw_vport_tbl_get(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_flow_attr attr = {}; + struct mlx5_eswitch_rep rep = {}; + struct mlx5_flow_table *fdb; + struct mlx5_vport *vport; + int i; + + attr.prio = 1; + attr.in_rep = &rep; + mlx5_esw_for_all_vports(esw, i, vport) { + attr.in_rep->vport = vport->vport; + fdb = esw_vport_tbl_get(esw, &attr); + if (!fdb) + goto out; + } + return 0; + +out: + mlx5_esw_vport_tbl_put(esw); + return PTR_ERR(fdb); +} + +void mlx5_esw_vport_tbl_put(struct mlx5_eswitch *esw) +{ + struct mlx5_esw_flow_attr attr = {}; + struct mlx5_eswitch_rep rep = {}; + struct mlx5_vport *vport; + int i; + + attr.prio = 1; + attr.in_rep = &rep; + mlx5_esw_for_all_vports(esw, i, vport) { + attr.in_rep->vport = vport->vport; + esw_vport_tbl_put(esw, &attr); + } +} + +/* End: Per vport tables */ + static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw, u16 vport_num) { @@ -191,8 +364,6 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, i++; } - mlx5_eswitch_set_rule_source_port(esw, spec, attr); - if (attr->outer_match_level != MLX5_MATCH_NONE) spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; if (attr->inner_match_level != MLX5_MATCH_NONE) @@ -201,8 +372,13 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) flow_act.modify_hdr = attr->modify_hdr; - fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, - !!split); + if (split) { + fdb = esw_vport_tbl_get(esw, attr); + } else { + fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, + 0); + mlx5_eswitch_set_rule_source_port(esw, spec, attr); + } if (IS_ERR(fdb)) { rule = ERR_CAST(fdb); goto err_esw_get; @@ -221,7 +397,10 @@ 
mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, return rule; err_add_rule: - mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, !!split); + if (split) + esw_vport_tbl_put(esw, attr); + else + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 0); err_esw_get: if (!(attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH) && attr->dest_chain) mlx5_esw_chains_put_table(esw, attr->dest_chain, 1, 0); @@ -247,7 +426,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, goto err_get_fast; } - fwd_fdb = mlx5_esw_chains_get_table(esw, attr->chain, attr->prio, 1); + fwd_fdb = esw_vport_tbl_get(esw, attr); if (IS_ERR(fwd_fdb)) { rule = ERR_CAST(fwd_fdb); goto err_get_fwd; @@ -285,7 +464,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw, return rule; add_err: - mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 1); + esw_vport_tbl_put(esw, attr); err_get_fwd: mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 0); err_get_fast: @@ -312,11 +491,14 @@ __mlx5_eswitch_del_rule(struct mlx5_eswitch *esw, atomic64_dec(&esw->offloads.num_flows); if (fwd_rule) { - mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 1); + esw_vport_tbl_put(esw, attr); mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, 0); } else { - mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, - !!split); + if (split) + esw_vport_tbl_put(esw, attr); + else + mlx5_esw_chains_put_table(esw, attr->chain, attr->prio, + 0); if (attr->dest_chain) mlx5_esw_chains_put_table(esw, attr->dest_chain, 1, 0); } @@ -1923,6 +2105,9 @@ static int esw_offloads_steering_init(struct mlx5_eswitch *esw) if (err) goto create_fg_err; + mutex_init(&esw->fdb_table.offloads.vports.lock); + hash_init(esw->fdb_table.offloads.vports.table); + return 0; create_fg_err: @@ -1939,6 +2124,7 @@ create_fdb_err: static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw) { + mutex_destroy(&esw->fdb_table.offloads.vports.lock); esw_destroy_vport_rx_group(esw); esw_destroy_offloads_table(esw); esw_destroy_offloads_fdb_tables(esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c index 4276194b633f..883c9e6ff0b2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c @@ -21,8 +21,6 @@ #define fdb_ignore_flow_level_supported(esw) \ (MLX5_CAP_ESW_FLOWTABLE_FDB((esw)->dev, ignore_flow_level)) -#define ESW_OFFLOADS_NUM_GROUPS 4 - /* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS), * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated * for each flow table pool. 
We can allocate up to 16M of each pool, @@ -704,12 +702,9 @@ mlx5_esw_chains_open(struct mlx5_eswitch *esw) /* Open level 1 for split rules now if prios isn't supported */ if (!mlx5_esw_chains_prios_supported(esw)) { - ft = mlx5_esw_chains_get_table(esw, 0, 1, 1); - - if (IS_ERR(ft)) { - err = PTR_ERR(ft); + err = mlx5_esw_vport_tbl_get(esw); + if (err) goto level_1_err; - } } return 0; @@ -725,7 +720,7 @@ static void mlx5_esw_chains_close(struct mlx5_eswitch *esw) { if (!mlx5_esw_chains_prios_supported(esw)) - mlx5_esw_chains_put_table(esw, 0, 1, 1); + mlx5_esw_vport_tbl_put(esw); mlx5_esw_chains_put_table(esw, 0, 1, 0); mlx5_esw_chains_put_table(esw, mlx5_esw_chains_get_ft_chain(esw), 1, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 9dc24241dc91..5826fd43d530 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2700,6 +2700,17 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering) goto out_err; } + /* We put this priority last, knowing that nothing will get here + * unless explicitly forwarded to. This is possible because the + * slow path tables have catch all rules and nothing gets passed + * those tables. + */ + maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_PER_VPORT, 1); + if (IS_ERR(maj_prio)) { + err = PTR_ERR(maj_prio); + goto out_err; + } + set_prio_attrs(steering->fdb_root_ns); return 0; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 4cae16016b2b..a5cf5c76f348 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -84,6 +84,7 @@ enum { FDB_TC_OFFLOAD, FDB_FT_OFFLOAD, FDB_SLOW_PATH, + FDB_PER_VPORT, }; struct mlx5_pkt_reformat; -- cgit v1.2.3 From 5682d393b40e1fe7426a7b8c3471f05262f42010 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:09 -0800 Subject: inet_diag: Refactor inet_sk_diag_fill(), dump(), and dump_one() In a later patch, there is a need to update "cb->min_dump_alloc" in inet_sk_diag_fill() as it learns the different bpf_sk_storages stored in a sk while dumping all sk(s) (e.g. tcp_hashinfo). The inet_sk_diag_fill() currently does not take the "cb" as an argument. One of the reasons is that inet_sk_diag_fill() is used by both dump_one() and dump() (which belong to the "struct inet_diag_handler"). The dump_one() interface does not pass the "cb" along. This patch makes dump_one() pass a "cb". The "cb" is created in inet_diag_cmd_exact(). The "nlh" and "in_skb" are stored in "cb" as the dump() interface does. The total number of args in inet_sk_diag_fill() is also cut from 10 to 7, which helps many callers pass fewer args. In particular, "struct user_namespace *user_ns", "u32 pid", and "u32 seq" can be replaced by accessing "cb->nlh" and "cb->skb". A similar argument reduction is also made to inet_twsk_diag_fill() and inet_req_diag_fill(). inet_csk_diag_dump() and inet_csk_diag_fill() are also removed. They are mostly equivalent to inet_sk_diag_fill(), and their repeated usages are very limited, so inet_sk_diag_fill() is directly used in those places.
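After this refactor, a protocol's dump_one() implementation only needs to forward the netlink_callback; everything that used to travel as separate arguments can be recovered from it. A sketch for a hypothetical foo protocol (foo_hashinfo is illustrative; the dccp and tcp handlers in the diff below are the real conversions):

static int foo_diag_dump_one(struct netlink_callback *cb,
                             const struct inet_diag_req_v2 *req)
{
        /* Formerly separate arguments, now recovered from "cb":
         *   request skb:   cb->skb
         *   request hdr:   cb->nlh
         *   portid / seq:  NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq
         *   user ns:       sk_user_ns(NETLINK_CB(cb->skb).sk)
         */
        return inet_diag_dump_one_icsk(&foo_hashinfo, cb, req);
}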
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230409.1975173-1-kafai@fb.com --- include/linux/inet_diag.h | 12 ++--- net/dccp/diag.c | 5 +- net/ipv4/inet_diag.c | 116 ++++++++++++++++++---------------------------- net/ipv4/raw_diag.c | 18 +++---- net/ipv4/tcp_diag.c | 4 +- net/ipv4/udp_diag.c | 26 +++++------ net/sctp/diag.c | 5 +- 7 files changed, 73 insertions(+), 113 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 39faaaf843e1..6b157ce07d74 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -18,8 +18,7 @@ struct inet_diag_handler { const struct inet_diag_req_v2 *r, struct nlattr *bc); - int (*dump_one)(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); void (*idiag_get_info)(struct sock *sk, @@ -42,16 +41,15 @@ struct inet_diag_handler { struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin); + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 73ef73a218ff..8f1e2a653f6d 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -51,11 +51,10 @@ static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); } -static int dccp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int dccp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); } static const struct inet_diag_handler dccp_diag_handler = { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f11e997e517b..d2ecff3195ba 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -157,11 +157,9 @@ errout: EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) + struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; @@ -174,8 +172,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -187,7 
+185,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, + sk_user_ns(NETLINK_CB(cb->skb).sk), + net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { @@ -312,30 +312,19 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_csk_diag_fill(struct sock *sk, - struct sk_buff *skb, - const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) -{ - return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns, - portid, seq, nlmsg_flags, unlh, net_admin); -} - static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + struct netlink_callback *cb, + u16 nlmsg_flags) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -359,16 +348,16 @@ static int inet_twsk_diag_fill(struct sock *sk, } static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) { struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; @@ -397,21 +386,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, bool net_admin) + u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh); + return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags); if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, - nlmsg_flags, unlh, net_admin); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, @@ -459,10 +445,10 @@ struct sock *inet_diag_find_one_icsk(struct net *net, EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct sk_buff *in_skb, - const struct nlmsghdr *nlh, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; @@ -479,10 +465,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, goto out; } - err = sk_diag_fill(sk, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - 
nlh->nlmsg_seq, 0, nlh, net_admin); + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); @@ -509,14 +492,19 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, int err; handler = inet_diag_lock_handler(req->sdiag_protocol); - if (IS_ERR(handler)) + if (IS_ERR(handler)) { err = PTR_ERR(handler); - else if (cmd == SOCK_DIAG_BY_FAMILY) - err = handler->dump_one(in_skb, nlh, req); - else if (cmd == SOCK_DESTROY && handler->destroy) + } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct netlink_callback cb = { + .nlh = nlh, + .skb = in_skb, + }; + err = handler->dump_one(&cb, req); + } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); - else + } else { err = -EOPNOTSUPP; + } inet_diag_unlock_handler(handler); return err; @@ -847,23 +835,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static int inet_csk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - const struct nlattr *bc, - bool net_admin) -{ - if (!inet_diag_bc_sk(bc, sk)) - return 0; - - return inet_csk_diag_fill(sk, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, - net_admin); -} - static void twsk_build_assert(void) { BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != @@ -935,8 +906,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, r->id.idiag_sport) goto next_listen; - if (inet_csk_diag_dump(sk, skb, cb, r, - bc, net_admin) < 0) { + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { spin_unlock(&ilb->lock); goto done; } @@ -1014,11 +989,8 @@ next_normal: res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e35736b99300..a2933eeabd91 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -87,15 +87,16 @@ out_unlock: return sk ? 
sk : ERR_PTR(-ENOENT); } -static int raw_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct net *net = sock_net(in_skb->sk); + struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; + struct net *net; int err; + net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -108,10 +109,7 @@ static int raw_diag_dump_one(struct sk_buff *in_skb, return -ENOMEM; } - err = inet_sk_diag_fill(sk, NULL, rep, r, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, + err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); @@ -136,11 +134,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0d08f9e2d8d0..bcd3a26efff1 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -184,10 +184,10 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } -static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); + return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 910555a4d9fe..7d65a6a5cd51 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, if (!inet_diag_bc_sk(bc, sk)) return 0; - return inet_sk_diag_fill(sk, NULL, skb, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin); + return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, + net_admin); } -static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int udp_dump_one(struct udp_table *tbl, + struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; int err = -EINVAL; struct sock *sk = NULL; struct sk_buff *rep; @@ -70,11 +69,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, if (!rep) goto out; - err = inet_sk_diag_fill(sk, NULL, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - netlink_net_capable(in_skb, CAP_NET_ADMIN)); + err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -151,10 +147,10 @@ static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, udp_dump(&udp_table, skb, cb, r, bc); } -static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udp_table, in_skb, nlh, req); + return 
udp_dump_one(&udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -255,10 +251,10 @@ static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, udp_dump(&udplite_table, skb, cb, r, bc); } -static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, +static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udplite_table, in_skb, nlh, req); + return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 8a15146faaeb..bed6436cd0af 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -432,11 +432,12 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); } -static int sctp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, +static int sctp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { + struct sk_buff *in_skb = cb->skb; struct net *net = sock_net(in_skb->sk); + const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; struct sctp_comm_param commp = { .skb = in_skb, -- cgit v1.2.3 From 0df6d32842b9a5f97a29ea90c8adc5cfac38341d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:15 -0800 Subject: inet_diag: Move the INET_DIAG_REQ_BYTECODE nlattr to cb->data The INET_DIAG_REQ_BYTECODE nlattr is currently re-found every time the "dump()" is re-started. In a later patch, it will also need to parse the new INET_DIAG_REQ_SK_BPF_STORAGES nlattr to learn the map_fds. Thus, this patch takes the chance to store the parsed nlattr in cb->data during the "start" phase of a dump. By doing this, the "bc" argument also becomes unnecessary and is removed. Also, the two copies of the INET_DIAG_REQ_BYTECODE parsing-audit logic between the compat and current versions can be consolidated into one.
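With the request attributes parsed once in the netlink ->start() callback and stashed in cb->data, every ->dump() pass picks the bytecode filter back up instead of re-walking the request message. A condensed consumer-side sketch (foo_for_each_sk() stands in for the protocol's socket iteration; the raw and udp dumpers in the diff below are the real versions):

static void foo_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
                          const struct inet_diag_req_v2 *r)
{
        bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
        struct inet_diag_dump_data *cb_data = cb->data; /* set by start() */
        struct nlattr *bc = cb_data->inet_diag_nla_bc;
        struct sock *sk;

        foo_for_each_sk(sk) {                   /* hypothetical iterator */
                if (!inet_diag_bc_sk(bc, sk))   /* cached bytecode filter */
                        continue;
                if (inet_sk_diag_fill(sk, NULL, skb, cb, r,
                                      NLM_F_MULTI, net_admin) < 0)
                        break;
        }
}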
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230415.1975555-1-kafai@fb.com --- include/linux/inet_diag.h | 11 ++-- include/uapi/linux/inet_diag.h | 3 +- net/dccp/diag.c | 4 +- net/ipv4/inet_diag.c | 117 ++++++++++++++++++++++++----------------- net/ipv4/raw_diag.c | 6 ++- net/ipv4/tcp_diag.c | 4 +- net/ipv4/udp_diag.c | 15 +++--- net/sctp/diag.c | 2 +- 8 files changed, 98 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 6b157ce07d74..1bb94cac265f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -15,8 +15,7 @@ struct netlink_callback; struct inet_diag_handler { void (*dump)(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int (*dump_one)(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); @@ -39,6 +38,11 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct inet_diag_dump_data { + struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; +#define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +}; + struct inet_connection_sock; int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, @@ -46,8 +50,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, u16 nlmsg_flags, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc); + const struct inet_diag_req_v2 *r); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct netlink_callback *cb, const struct inet_diag_req_v2 *req); diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a1ff345b3f33..bab9a9f8da12 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,9 +64,10 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + __INET_DIAG_REQ_MAX, }; -#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE +#define INET_DIAG_REQ_MAX (__INET_DIAG_REQ_MAX - 1) /* Bytecode is sequence of 4 byte commands followed by variable arguments. 
* All the commands identified by "code" are conditional jumps forward: diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 8f1e2a653f6d..8a82c5a2c5a8 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -46,9 +46,9 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); } static int dccp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index d2ecff3195ba..4bce8a477699 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -495,9 +495,11 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { + struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, + .data = &empty_dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -863,14 +865,17 @@ static void twsk_build_assert(void) void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; + struct nlattr *bc; struct sock *sk; + bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -1014,15 +1019,14 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; int err = 0; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) - handler->dump(skb, cb, r, bc); + handler->dump(skb, cb, r); else err = PTR_ERR(handler); inet_diag_unlock_handler(handler); @@ -1032,13 +1036,57 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int hdrlen = sizeof(struct inet_diag_req_v2); - struct nlattr *bc = NULL; + return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); +} + +static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) +{ + const struct nlmsghdr *nlh = cb->nlh; + struct inet_diag_dump_data *cb_data; + struct sk_buff *skb = cb->skb; + struct nlattr *nla; + int rem, err; + + cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); + if (!cb_data) + return -ENOMEM; + + nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), rem) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + cb_data->req_nlas[type] = nla; + } + + nla = cb_data->inet_diag_nla_bc; + if (nla) { + err = inet_diag_bc_audit(nla, skb); + if (err) { + kfree(cb_data); + return err; + } + } + + cb->data = cb_data; + return 0; +} + +static int inet_diag_dump_start(struct netlink_callback *cb) +{ + return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); +} + +static int inet_diag_dump_start_compat(struct netlink_callback *cb) +{ + return 
__inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); +} - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + kfree(cb->data); - return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); + return 0; } static int inet_diag_type2proto(int type) @@ -1057,9 +1105,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); - int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; - struct nlattr *bc = NULL; req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1067,10 +1113,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, req.idiag_states = rc->idiag_states; req.id = rc->id; - if (nlmsg_attrlen(cb->nlh, hdrlen)) - bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - - return __inet_diag_dump(skb, cb, &req, bc); + return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, @@ -1098,22 +1141,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(nlh, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(nlh, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump_compat, - }; - return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start_compat, + .done = inet_diag_dump_done, + .dump = inet_diag_dump_compat, + }; + return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); @@ -1129,22 +1162,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { - if (nlmsg_attrlen(h, hdrlen)) { - struct nlattr *attr; - int err; - - attr = nlmsg_find_attr(h, hdrlen, - INET_DIAG_REQ_BYTECODE); - err = inet_diag_bc_audit(attr, skb); - if (err) - return err; - } - { - struct netlink_dump_control c = { - .dump = inet_diag_dump, - }; - return netlink_dump_start(net->diag_nlsk, skb, h, &c); - } + struct netlink_dump_control c = { + .start = inet_diag_dump_start, + .done = inet_diag_dump_done, + .dump = inet_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h)); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index a2933eeabd91..d19cce39be1b 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -138,17 +138,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct sock *sk = NULL; + struct nlattr *bc; if (IS_ERR(hashinfo)) return; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index bcd3a26efff1..75a1c985f49a 100644 --- 
a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -179,9 +179,9 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); + inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7d65a6a5cd51..93884696abdd 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -89,12 +89,16 @@ out_nosk: static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); + struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; + struct nlattr *bc; + cb_data = cb->data; + bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -142,9 +146,9 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udp_table, skb, cb, r, bc); + udp_dump(&udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, @@ -245,10 +249,9 @@ static const struct inet_diag_handler udp_diag_handler = { }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - struct nlattr *bc) + const struct inet_diag_req_v2 *r) { - udp_dump(&udplite_table, skb, cb, r, bc); + udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index bed6436cd0af..69743a6aaf6f 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -471,7 +471,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r) { u32 idiag_states = r->idiag_states; struct net *net = sock_net(skb->sk); -- cgit v1.2.3 From 1ed4d92458a969e71e7914550b6f0c730c14d84e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:21 -0800 Subject: bpf: INET_DIAG support in bpf_sk_storage This patch adds INET_DIAG support to bpf_sk_storage. 1. Although this series adds bpf_sk_storage diag capability to inet sk, bpf_sk_storage is in general applicable to all fullsock. Hence, the bpf_sk_storage logic will operate on SK_DIAG_* nlattr. The caller will pass in its specific nesting nlattr (e.g. INET_DIAG_*) as the argument. 2. The request will be like: INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32) ...... Considering there could have multiple bpf_sk_storages in a sk, instead of reusing INET_DIAG_INFO ("ss -i"), the user can select some specific bpf_sk_storage to dump by specifying an array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD. If no SK_DIAG_BPF_STORAGE_REQ_MAP_FD is specified (i.e. an empty INET_DIAG_REQ_SK_BPF_STORAGES), it will dump all bpf_sk_storages of a sk. 3. 
The reply will be like: INET_DIAG_BPF_SK_STORAGES (nla_nest) (defined in latter patch) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) SK_DIAG_BPF_STORAGE (nla_nest) SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) ...... 4. Unlike other INET_DIAG info of a sk which is pretty static, the size required to dump the bpf_sk_storage(s) of a sk is dynamic as the system adding more bpf_sk_storage_map. It is hard to set a static min_dump_alloc size. Hence, this series learns it at the runtime and adjust the cb->min_dump_alloc as it iterates all sk(s) of a system. The "unsigned int *res_diag_size" in bpf_sk_storage_diag_put() is for this purpose. The next patch will update the cb->min_dump_alloc as it iterates the sk(s). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230421.1975729-1-kafai@fb.com --- include/linux/bpf.h | 1 + include/net/bpf_sk_storage.h | 27 ++++ include/uapi/linux/sock_diag.h | 26 ++++ kernel/bpf/syscall.c | 15 +++ net/core/bpf_sk_storage.c | 283 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 346 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9aa33b8f3d55..6015a4daf118 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1023,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); +struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 8e4f831d2e52..5036c94c0503 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_sk_storage_diag; +struct sk_buff; +struct nlattr; +struct sock; + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs); +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag); +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size); #else static inline int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) { return 0; } +static inline struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla) +{ + return NULL; +} +static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ +} +static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + return 0; +} #endif #endif /* _BPF_SK_STORAGE_H */ diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index e5925009a652..5f74a5f6091d 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -36,4 +36,30 @@ enum sknetlink_groups { }; #define SKNLGRP_MAX (__SKNLGRP_MAX - 1) +enum { + SK_DIAG_BPF_STORAGE_REQ_NONE, + SK_DIAG_BPF_STORAGE_REQ_MAP_FD, + 
__SK_DIAG_BPF_STORAGE_REQ_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_REQ_MAX (__SK_DIAG_BPF_STORAGE_REQ_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_REP_NONE, + SK_DIAG_BPF_STORAGE, + __SK_DIAG_BPF_STORAGE_REP_MAX, +}; + +#define SK_DIAB_BPF_STORAGE_REP_MAX (__SK_DIAG_BPF_STORAGE_REP_MAX - 1) + +enum { + SK_DIAG_BPF_STORAGE_NONE, + SK_DIAG_BPF_STORAGE_PAD, + SK_DIAG_BPF_STORAGE_MAP_ID, + SK_DIAG_BPF_STORAGE_MAP_VALUE, + __SK_DIAG_BPF_STORAGE_MAX, +}; + +#define SK_DIAG_BPF_STORAGE_MAX (__SK_DIAG_BPF_STORAGE_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a79743a89815..c536c65256ad 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -902,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); +struct bpf_map *bpf_map_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_map *map; + + map = __bpf_map_get(f); + if (IS_ERR(map)) + return map; + + bpf_map_inc(map); + fdput(f); + + return map; +} + struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 3ab23f698221..3415a4896c59 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -8,6 +8,7 @@ #include #include #include +#include #include static atomic_t cache_idx; @@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) kfree(map); } +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_sk_storage_elem))) + static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK || @@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) || - /* U16_MAX is much more than enough for sk local storage - * considering a tcp_sock is ~2k. - */ - attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem)) + if (attr->value_size > MAX_VALUE_SIZE) return -E2BIG; return 0; @@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_SOCKET, }; + +struct bpf_sk_storage_diag { + u32 nr_maps; + struct bpf_map *maps[]; +}; + +/* The reply will be like: + * INET_DIAG_BPF_SK_STORAGES (nla_nest) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + * .... 
+ */ +static int nla_value_size(u32 value_size) +{ + /* SK_DIAG_BPF_STORAGE (nla_nest) + * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) + * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) + */ + return nla_total_size(0) + nla_total_size(sizeof(u32)) + + nla_total_size_64bit(value_size); +} + +void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag) +{ + u32 i; + + if (!diag) + return; + + for (i = 0; i < diag->nr_maps; i++) + bpf_map_put(diag->maps[i]); + + kfree(diag); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free); + +static bool diag_check_dup(const struct bpf_sk_storage_diag *diag, + const struct bpf_map *map) +{ + u32 i; + + for (i = 0; i < diag->nr_maps; i++) { + if (diag->maps[i] == map) + return true; + } + + return false; +} + +struct bpf_sk_storage_diag * +bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) +{ + struct bpf_sk_storage_diag *diag; + struct nlattr *nla; + u32 nr_maps = 0; + int rem, err; + + /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as + * the map_alloc_check() side also does. + */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + nla_for_each_nested(nla, nla_stgs, rem) { + if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + nr_maps++; + } + + diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps, + GFP_KERNEL); + if (!diag) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nla, nla_stgs, rem) { + struct bpf_map *map; + int map_fd; + + if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD) + continue; + + map_fd = nla_get_u32(nla); + map = bpf_map_get(map_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto err_free; + } + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) { + bpf_map_put(map); + err = -EINVAL; + goto err_free; + } + if (diag_check_dup(diag, map)) { + bpf_map_put(map); + err = -EEXIST; + goto err_free; + } + diag->maps[diag->nr_maps++] = map; + } + + return diag; + +err_free: + bpf_sk_storage_diag_free(diag); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); + +static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb) +{ + struct nlattr *nla_stg, *nla_value; + struct bpf_sk_storage_map *smap; + + /* It cannot exceed max nlattr's payload */ + BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE); + + nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE); + if (!nla_stg) + return -EMSGSIZE; + + smap = rcu_dereference(sdata->smap); + if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) + goto errout; + + nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE, + smap->map.value_size, + SK_DIAG_BPF_STORAGE_PAD); + if (!nla_value) + goto errout; + + if (map_value_has_spin_lock(&smap->map)) + copy_map_value_locked(&smap->map, nla_data(nla_value), + sdata->data, true); + else + copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + + nla_nest_end(skb, nla_stg); + return 0; + +errout: + nla_nest_cancel(skb, nla_stg); + return -EMSGSIZE; +} + +static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. 
INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + struct bpf_sk_storage_map *smap; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + + rcu_read_lock(); + + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { + smap = rcu_dereference(SDATA(selem)->smap); + diag_size += nla_value_size(smap->map.value_size); + + if (nla_stgs && diag_get(SDATA(selem), skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} + +int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, + struct sock *sk, struct sk_buff *skb, + int stg_array_type, + unsigned int *res_diag_size) +{ + /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */ + unsigned int diag_size = nla_total_size(0); + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_data *sdata; + struct nlattr *nla_stgs; + unsigned int saved_len; + int err = 0; + u32 i; + + *res_diag_size = 0; + + /* No map has been specified. Dump all. */ + if (!diag->nr_maps) + return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type, + res_diag_size); + + rcu_read_lock(); + sk_storage = rcu_dereference(sk->sk_bpf_storage); + if (!sk_storage || hlist_empty(&sk_storage->list)) { + rcu_read_unlock(); + return 0; + } + + nla_stgs = nla_nest_start(skb, stg_array_type); + if (!nla_stgs) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + + saved_len = skb->len; + for (i = 0; i < diag->nr_maps; i++) { + sdata = __sk_storage_lookup(sk_storage, + (struct bpf_sk_storage_map *)diag->maps[i], + false); + + if (!sdata) + continue; + + diag_size += nla_value_size(diag->maps[i]->value_size); + + if (nla_stgs && diag_get(sdata, skb)) + /* Continue to learn diag_size */ + err = -EMSGSIZE; + } + rcu_read_unlock(); + + if (nla_stgs) { + if (saved_len == skb->len) + nla_nest_cancel(skb, nla_stgs); + else + nla_nest_end(skb, nla_stgs); + } + + if (diag_size == nla_total_size(0)) { + *res_diag_size = 0; + return 0; + } + + *res_diag_size = diag_size; + return err; +} +EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); -- cgit v1.2.3 From 085c20cacf2b72991ce1c9d99a5e2f1d9e73bb68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Feb 2020 15:04:27 -0800 Subject: bpf: inet_diag: Dump bpf_sk_storages in inet_diag_dump() This patch will dump out the bpf_sk_storages of a sk if the request has the INET_DIAG_REQ_SK_BPF_STORAGES nlattr. An array of SK_DIAG_BPF_STORAGE_REQ_MAP_FD can be specified in INET_DIAG_REQ_SK_BPF_STORAGES to select which bpf_sk_storage to dump. If no map_fd is specified, all bpf_sk_storages of a sk will be dumped. bpf_sk_storages can be added to the system at runtime. It is difficult to find a proper static value for cb->min_dump_alloc. This patch learns the nlattr size required to dump the bpf_sk_storages of a sk. If it happens to be the very first nlmsg of a dump and it cannot fit the needed bpf_sk_storages, it will try to expand the skb by "pskb_expand_head()". 
Instead of expanding it in inet_sk_diag_fill(), it is expanded at a sleepable context in __inet_diag_dump() so __GFP_DIRECT_RECLAIM can be used. In __inet_diag_dump(), it will retry as long as the skb is empty and the cb->min_dump_alloc becomes larger than before. cb->min_dump_alloc is bounded by KMALLOC_MAX_SIZE. The min_dump_alloc is also changed from 'u16' to 'u32' to accommodate a sk that may have a few large bpf_sk_storages. The updated cb->min_dump_alloc will also be used to allocate the skb in the next dump. This logic already exists in netlink_dump(). Here is the sample output of a locally modified 'ss' and it could be made more readable by using BTF later: [root@arch-fb-vm1 ~]# ss --bpf-map-id 14 --bpf-map-id 13 -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:PortProcess ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] [root@arch-fb-vm1 ~]# ~/devshare/github/iproute2/misc/ss --bpf-maps -t6an 'dst [::1]:8989' State Recv-Q Send-Q Local Address:Port Peer Address:Port Process ESTAB 0 0 [::1]:51072 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] ESTAB 0 0 [::1]:51070 [::1]:8989 bpf_map_id:14 value:[ 3feb ] bpf_map_id:13 value:[ 3f ] bpf_map_id:12 value:[ 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000... total:65407 ] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200225230427.1976129-1-kafai@fb.com --- include/linux/inet_diag.h | 4 +++ include/linux/netlink.h | 4 +-- include/uapi/linux/inet_diag.h | 2 ++ net/ipv4/inet_diag.c | 74 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 1bb94cac265f..e4ba25d63913 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -38,9 +38,13 @@ struct inet_diag_handler { __u16 idiag_info_size; }; +struct bpf_sk_storage_diag; struct inet_diag_dump_data { struct nlattr *req_nlas[__INET_DIAG_REQ_MAX]; #define inet_diag_nla_bc req_nlas[INET_DIAG_REQ_BYTECODE] +#define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] + + struct bpf_sk_storage_diag *bpf_stg_diag; }; struct inet_connection_sock; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 205fa7b1f07a..788969ccbbde 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -188,10 +188,10 @@ struct netlink_callback { struct module *module; struct netlink_ext_ack *extack; u16 family; - u16 min_dump_alloc; - bool strict_check; u16 answer_flags; + u32 min_dump_alloc; unsigned int prev_seq, seq; + bool strict_check; union { u8 ctx[48]; diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bab9a9f8da12..75dffd78363a 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -64,6 +64,7 @@ struct inet_diag_req_raw { enum { INET_DIAG_REQ_NONE, INET_DIAG_REQ_BYTECODE, + INET_DIAG_REQ_SK_BPF_STORAGES, __INET_DIAG_REQ_MAX, }; @@ -155,6 +156,7 @@ enum { INET_DIAG_CLASS_ID, /* request as INET_DIAG_TCLASS */ INET_DIAG_MD5SIG, INET_DIAG_ULP_INFO, + INET_DIAG_SK_BPF_STORAGES, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 
4bce8a477699..e1cad25909df 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,8 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, @@ -163,12 +166,14 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; + struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + cb_data = cb->data; handler = inet_diag_table[req->sdiag_protocol]; BUG_ON(!handler); @@ -302,6 +307,48 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + /* Keep it at the end for potential retry with a larger skb, + * or else do best-effort fitting, which is only done for the + * first_nlmsg. + */ + if (cb_data->bpf_stg_diag) { + bool first_nlmsg = ((unsigned char *)nlh == skb->data); + unsigned int prev_min_dump_alloc; + unsigned int total_nla_size = 0; + unsigned int msg_len; + int err; + + msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; + err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, + INET_DIAG_SK_BPF_STORAGES, + &total_nla_size); + + if (!err) + goto out; + + total_nla_size += msg_len; + prev_min_dump_alloc = cb->min_dump_alloc; + if (total_nla_size > prev_min_dump_alloc) + cb->min_dump_alloc = min_t(u32, total_nla_size, + MAX_DUMP_ALLOC_SIZE); + + if (!first_nlmsg) + goto errout; + + if (cb->min_dump_alloc > prev_min_dump_alloc) + /* Retry with pskb_expand_head() with + * __GFP_DIRECT_RECLAIM + */ + goto errout; + + WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); + + /* Send what we have for this sk + * and move on to the next sk in the following + * dump() + */ + } + out: nlmsg_end(skb, nlh); return 0; @@ -1022,8 +1069,11 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { const struct inet_diag_handler *handler; + u32 prev_min_dump_alloc; int err = 0; +again: + prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(r->sdiag_protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); @@ -1031,6 +1081,15 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, err = PTR_ERR(handler); inet_diag_unlock_handler(handler); + /* The skb is not large enough to fit one sk info and + * inet_sk_diag_fill() has requested for a larger skb. + */ + if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { + err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); + if (!err) + goto again; + } + return err ? 
: skb->len; } @@ -1068,6 +1127,18 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) } } + nla = cb_data->inet_diag_nla_bpf_stgs; + if (nla) { + struct bpf_sk_storage_diag *bpf_stg_diag; + + bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); + if (IS_ERR(bpf_stg_diag)) { + kfree(cb_data); + return PTR_ERR(bpf_stg_diag); + } + cb_data->bpf_stg_diag = bpf_stg_diag; + } + cb->data = cb_data; return 0; } @@ -1084,6 +1155,9 @@ static int inet_diag_dump_start_compat(struct netlink_callback *cb) static int inet_diag_dump_done(struct netlink_callback *cb) { + struct inet_diag_dump_data *cb_data = cb->data; + + bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0; -- cgit v1.2.3 From 8402a31dd803e091fd2ec9cd22040b34a0b07085 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 28 Feb 2020 07:33:37 -0600 Subject: net: dccp: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/linux/dccp.h | 2 +- net/dccp/ccid.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6b64b6cc2175..07e547c02fd8 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -198,7 +198,7 @@ enum dccp_role { struct dccp_service_list { __u32 dccpsl_nr; - __be32 dccpsl_list[0]; + __be32 dccpsl_list[]; }; #define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 70f88f2b4456..105f3734dadb 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -95,7 +95,7 @@ void ccid_cleanup_builtins(void); struct ccid { struct ccid_operations *ccid_ops; - char ccid_priv[0]; + char ccid_priv[]; }; static inline void *ccid_priv(const struct ccid *ccid) -- cgit v1.2.3 From e427cad6eee47e2daf207cd7a4156ae72496ee07 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 28 Feb 2020 14:45:22 +0100 Subject: net: datagram: drop 'destructor' argument from several helpers The only users for such argument are the UDP protocol and the UNIX socket family. We can safely reclaim the accounted memory directly from the UDP code and, after the previous patch, we can do scm stats accounting outside the datagram helpers. Overall this cleans up a bit some datagram-related helpers, and avoids an indirect call per packet in the UDP receive path. 
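For example, the UDP receive path now runs the destructor itself, still under the queue lock and only when the packet is actually consumed (a minimal sketch of the pattern this patch applies in __skb_recv_udp(); see the udp.c hunks below):

	spin_lock_bh(&queue->lock);
	skb = __skb_try_recv_from_queue(sk, queue, flags, off, err, &last);
	if (skb) {
		if (!(flags & MSG_PEEK))
			udp_skb_destructor(sk, skb); /* reclaim accounted memory directly */
		spin_unlock_bh(&queue->lock);
		return skb;
	}
	spin_unlock_bh(&queue->lock);
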
v1 -> v2: - call scm_stat_del() only when not peeking - Kirill - fix build issue with CONFIG_INET_ESPINTCP Signed-off-by: Paolo Abeni Reviewed-by: Kirill Tkhai Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++---------- net/core/datagram.c | 25 +++++++------------------ net/ipv4/udp.c | 14 ++++++++------ net/unix/af_unix.c | 7 +++++-- net/xfrm/espintcp.c | 2 +- 5 files changed, 23 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5b50278c4bc8..21749b2cdc9b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3514,23 +3514,15 @@ int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err, + unsigned int flags, int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err); + unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, diff --git a/net/core/datagram.c b/net/core/datagram.c index a78e7f864c1e..4213081c6ed3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -166,8 +166,6 @@ done: struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, struct sk_buff_head *queue, unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), int *off, int *err, struct sk_buff **last) { @@ -198,8 +196,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, refcount_inc(&skb->users); } else { __skb_unlink(skb, queue); - if (destructor) - destructor(sk, skb); } *off = _off; return skb; @@ -212,7 +208,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, * @sk: socket * @queue: socket queue from which to receive * @flags: MSG\_ flags - * @destructor: invoked under the receive lock on successful dequeue * @off: an offset in bytes to peek skb from. Returns an offset * within an skb where data actually starts * @err: error code returned @@ -245,10 +240,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, */ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, struct sk_buff_head *queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err, + unsigned int flags, int *off, int *err, struct sk_buff **last) { struct sk_buff *skb; @@ -269,8 +261,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 
8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, - off, &error, last); + skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) goto no_packet; @@ -293,10 +285,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram); struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, - unsigned int flags, - void (*destructor)(struct sock *sk, - struct sk_buff *skb), - int *off, int *err) + unsigned int flags, int *off, int *err) { struct sk_buff *skb, *last; long timeo; @@ -304,8 +293,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { - skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor, - off, err, &last); + skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err, + &last); if (skb) return skb; @@ -326,7 +315,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags | (noblock ? MSG_DONTWAIT : 0), - NULL, &off, err); + &off, err); } EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08a41f1e1cd2..a68e2ac37f26 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1671,10 +1671,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, flags, - udp_skb_destructor, - off, err, &last); + skb = __skb_try_recv_from_queue(sk, queue, flags, off, + err, &last); if (skb) { + if (!(flags & MSG_PEEK)) + udp_skb_destructor(sk, skb); spin_unlock_bh(&queue->lock); return skb; } @@ -1692,9 +1693,10 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, - udp_skb_dtor_locked, - off, err, &last); + skb = __skb_try_recv_from_queue(sk, queue, flags, off, + err, &last); + if (skb && !(flags & MSG_PEEK)) + udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c46fa271fc4a..3385a7a0b231 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2106,9 +2106,12 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, skip = sk_peek_offset(sk, flags); skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, - scm_stat_del, &skip, &err, &last); - if (skb) + &skip, &err, &last); + if (skb) { + if (!(flags & MSG_PEEK)) + scm_stat_del(sk, skb); break; + } mutex_unlock(&u->iolock); diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index f15d6a564b0e..037ea156d2f9 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -100,7 +100,7 @@ static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, flags |= nonblock ? MSG_DONTWAIT : 0; - skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err); + skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, &off, &err); if (!skb) return err; -- cgit v1.2.3 From 70ae1e127b486704c62d3537d69ca65c446d4d83 Mon Sep 17 00:00:00 2001 From: Cris Forno Date: Fri, 28 Feb 2020 14:12:04 -0600 Subject: ethtool: Factored out similar ethtool link settings for virtual devices to core Three virtual devices (ibmveth, virtio_net, and netvsc) all have similar code to set link settings and validate ethtool command. 
To eliminate duplication of code, it is factored out into core/ethtool.c. Signed-off-by: Cris Forno Signed-off-by: David S. Miller --- include/linux/ethtool.h | 6 ++++++ net/ethtool/ioctl.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 95991e4300bf..23373978cb3c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -420,4 +420,10 @@ struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *rule); +bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd); +int ethtool_virtdev_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd, + u32 *dev_speed, u8 *dev_duplex); + + #endif /* _LINUX_ETHTOOL_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index b987052d91ef..f2fe8e5896dc 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -459,6 +459,24 @@ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, return 0; } +/* Check if the user is trying to change anything besides speed/duplex */ +bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd) +{ + struct ethtool_link_settings base2 = {}; + + base2.speed = cmd->base.speed; + base2.port = PORT_OTHER; + base2.duplex = cmd->base.duplex; + base2.cmd = cmd->base.cmd; + base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords; + + return !memcmp(&base2, &cmd->base, sizeof(base2)) && + bitmap_empty(cmd->link_modes.supported, + __ETHTOOL_LINK_MODE_MASK_NBITS) && + bitmap_empty(cmd->link_modes.lp_advertising, + __ETHTOOL_LINK_MODE_MASK_NBITS); +} + /* convert a kernel internal ethtool_link_ksettings to * ethtool_link_usettings in user space. return 0 on success, errno on * error. @@ -581,6 +599,27 @@ static int ethtool_set_link_ksettings(struct net_device *dev, return err; } +int ethtool_virtdev_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd, + u32 *dev_speed, u8 *dev_duplex) +{ + u32 speed; + u8 duplex; + + speed = cmd->base.speed; + duplex = cmd->base.duplex; + /* don't allow custom speed and duplex */ + if (!ethtool_validate_speed(speed) || + !ethtool_validate_duplex(duplex) || + !ethtool_virtdev_validate_cmd(cmd)) + return -EINVAL; + *dev_speed = speed; + *dev_duplex = duplex; + + return 0; +} +EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings); + /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, this is -- cgit v1.2.3 From bb4cf02d4c74f0db60f58c406ddfdfed16d14f84 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 2 Mar 2020 05:59:33 -0600 Subject: netdevice: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. 
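For instance, allocation sites need no change, since the existing size helpers already account for the trailing array (a generic sketch using the RPS_MAP_SIZE() macro visible in the hunk below; 'nr_cpus' is an illustrative variable, not taken from this patch):

	struct rps_map *map;

	/* one header plus nr_cpus trailing u16 entries */
	map = kzalloc(RPS_MAP_SIZE(nr_cpus), GFP_KERNEL);
	if (map)
		map->len = nr_cpus;
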
Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6c3f7032e8d9..b6fedd54cd8e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -664,7 +664,7 @@ static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node struct rps_map { unsigned int len; struct rcu_head rcu; - u16 cpus[0]; + u16 cpus[]; }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) @@ -686,7 +686,7 @@ struct rps_dev_flow { struct rps_dev_flow_table { unsigned int mask; struct rcu_head rcu; - struct rps_dev_flow flows[0]; + struct rps_dev_flow flows[]; }; #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ ((_num) * sizeof(struct rps_dev_flow))) @@ -704,7 +704,7 @@ struct rps_dev_flow_table { struct rps_sock_flow_table { u32 mask; - u32 ents[0] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) @@ -767,7 +767,7 @@ struct xps_map { unsigned int len; unsigned int alloc_len; struct rcu_head rcu; - u16 queues[0]; + u16 queues[]; }; #define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16))) #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ @@ -778,7 +778,7 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; - struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */ + struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; #define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ -- cgit v1.2.3 From 70ed506c3bbcfa846d4636b23051ca79fa4781f7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 2 Mar 2020 20:31:57 -0800 Subject: bpf: Introduce pinnable bpf_link abstraction Introduce bpf_link abstraction, representing an attachment of BPF program to a BPF hook point (e.g., tracepoint, perf event, etc). bpf_link encapsulates ownership of attached BPF program, reference counting of a link itself, when reference from multiple anonymous inodes, as well as ensures that release callback will be called from a process context, so that users can safely take mutex locks and sleep. Additionally, with a new abstraction it's now possible to generalize pinning of a link object in BPF FS, allowing to explicitly prevent BPF program detachment on process exit by pinning it in a BPF FS and let it open from independent other process to keep working with it. Convert two existing bpf_link-like objects (raw tracepoint and tracing BPF program attachments) into utilizing bpf_link framework, making them pinnable in BPF FS. More FD-based bpf_links will be added in follow up patches. 
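As a sketch of how a new link type plugs into this framework (modeled on the tracing link conversion below; bpf_foo_link and its ops are illustrative names only):

	struct bpf_foo_link {
		struct bpf_link link;
		/* hook-specific state goes here */
	};

	static void bpf_foo_link_release(struct bpf_link *link)
	{
		struct bpf_foo_link *foo =
			container_of(link, struct bpf_foo_link, link);

		/* detach link->prog from its hook; bpf_link_free() drops
		 * the prog reference after this callback returns
		 */
		kfree(foo);
	}

	static const struct bpf_link_ops bpf_foo_link_lops = {
		.release = bpf_foo_link_release,
	};

	/* at attach time */
	bpf_link_init(&link->link, &bpf_foo_link_lops, prog);
	link_fd = bpf_link_new_fd(&link->link);
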
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200303043159.323675-2-andriin@fb.com --- include/linux/bpf.h | 13 +++ kernel/bpf/inode.c | 42 +++++++++- kernel/bpf/syscall.c | 223 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 232 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6015a4daf118..f13c78c6f29d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1056,6 +1056,19 @@ extern int sysctl_unprivileged_bpf_disabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); +struct bpf_link; + +struct bpf_link_ops { + void (*release)(struct bpf_link *link); +}; + +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog); +void bpf_link_inc(struct bpf_link *link); +void bpf_link_put(struct bpf_link *link); +int bpf_link_new_fd(struct bpf_link *link); +struct bpf_link *bpf_link_get_from_fd(u32 ufd); + int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5e40e7fccc21..95087d9f4ed3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -25,6 +25,7 @@ enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, + BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) @@ -36,6 +37,9 @@ static void *bpf_any_get(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_inc(raw); + break; default: WARN_ON_ONCE(1); break; @@ -53,6 +57,9 @@ static void bpf_any_put(void *raw, enum bpf_type type) case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; + case BPF_TYPE_LINK: + bpf_link_put(raw); + break; default: WARN_ON_ONCE(1); break; @@ -63,20 +70,32 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; - *type = BPF_TYPE_MAP; raw = bpf_map_get_with_uref(ufd); - if (IS_ERR(raw)) { + if (!IS_ERR(raw)) { + *type = BPF_TYPE_MAP; + return raw; + } + + raw = bpf_prog_get(ufd); + if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; - raw = bpf_prog_get(ufd); + return raw; } - return raw; + raw = bpf_link_get_from_fd(ufd); + if (!IS_ERR(raw)) { + *type = BPF_TYPE_LINK; + return raw; + } + + return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; +static const struct inode_operations bpf_link_iops = { }; static struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -114,6 +133,8 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; + else if (inode->i_op == &bpf_link_iops) + *type = BPF_TYPE_LINK; else return -EACCES; @@ -335,6 +356,12 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) &bpffs_map_fops : &bpffs_obj_fops); } +static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, + &bpffs_obj_fops); +} + static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { @@ -411,6 +438,9 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; + case BPF_TYPE_LINK: + ret = 
vfs_mkobj(dentry, mode, bpf_mklink, raw); + break; default: ret = -EPERM; } @@ -487,6 +517,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); + else if (type == BPF_TYPE_LINK) + ret = bpf_link_new_fd(raw); else return -ENOENT; @@ -504,6 +536,8 @@ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); + if (inode->i_op == &bpf_link_iops) + return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c536c65256ad..13de65363ba2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2173,24 +2173,154 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -static int bpf_tracing_prog_release(struct inode *inode, struct file *filp) +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, + struct bpf_prog *prog) { - struct bpf_prog *prog = filp->private_data; + atomic64_set(&link->refcnt, 1); + link->ops = ops; + link->prog = prog; +} + +void bpf_link_inc(struct bpf_link *link) +{ + atomic64_inc(&link->refcnt); +} + +/* bpf_link_free is guaranteed to be called from process context */ +static void bpf_link_free(struct bpf_link *link) +{ + struct bpf_prog *prog; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); + /* remember prog locally, because release below will free link memory */ + prog = link->prog; + /* extra clean up and kfree of container link struct */ + link->ops->release(link); + /* no more accesing of link members after this point */ bpf_prog_put(prog); +} + +static void bpf_link_put_deferred(struct work_struct *work) +{ + struct bpf_link *link = container_of(work, struct bpf_link, work); + + bpf_link_free(link); +} + +/* bpf_link_put can be called from atomic context, but ensures that resources + * are freed from process context + */ +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + + if (in_atomic()) { + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); + } else { + bpf_link_free(link); + } +} + +static int bpf_link_release(struct inode *inode, struct file *filp) +{ + struct bpf_link *link = filp->private_data; + + bpf_link_put(link); return 0; } -static const struct file_operations bpf_tracing_prog_fops = { - .release = bpf_tracing_prog_release, +#ifdef CONFIG_PROC_FS +static const struct bpf_link_ops bpf_raw_tp_lops; +static const struct bpf_link_ops bpf_tracing_link_lops; +static const struct bpf_link_ops bpf_xdp_link_lops; + +static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_link *link = filp->private_data; + const struct bpf_prog *prog = link->prog; + char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + const char *link_type; + + if (link->ops == &bpf_raw_tp_lops) + link_type = "raw_tracepoint"; + else if (link->ops == &bpf_tracing_link_lops) + link_type = "tracing"; + else + link_type = "unknown"; + + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "link_type:\t%s\n" + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + link_type, + prog_tag, + prog->aux->id); +} +#endif + +const struct file_operations bpf_link_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_link_show_fdinfo, +#endif + .release 
= bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; +int bpf_link_new_fd(struct bpf_link *link) +{ + return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); +} + +struct bpf_link *bpf_link_get_from_fd(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_link *link; + + if (!f.file) + return ERR_PTR(-EBADF); + if (f.file->f_op != &bpf_link_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + link = f.file->private_data; + bpf_link_inc(link); + fdput(f); + + return link; +} + +struct bpf_tracing_link { + struct bpf_link link; +}; + +static void bpf_tracing_link_release(struct bpf_link *link) +{ + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); + kfree(tr_link); +} + +static const struct bpf_link_ops bpf_tracing_link_lops = { + .release = bpf_tracing_link_release, +}; + static int bpf_tracing_prog_attach(struct bpf_prog *prog) { - int tr_fd, err; + struct bpf_tracing_link *link; + int link_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && @@ -2199,58 +2329,61 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_prog; + } + bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); + err = bpf_trampoline_link_prog(prog); if (err) - goto out_put_prog; + goto out_free_link; - tr_fd = anon_inode_getfd("bpf-tracing-prog", &bpf_tracing_prog_fops, - prog, O_CLOEXEC); - if (tr_fd < 0) { + link_fd = bpf_link_new_fd(&link->link); + if (link_fd < 0) { WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = tr_fd; - goto out_put_prog; + err = link_fd; + goto out_free_link; } - return tr_fd; + return link_fd; +out_free_link: + kfree(link); out_put_prog: bpf_prog_put(prog); return err; } -struct bpf_raw_tracepoint { +struct bpf_raw_tp_link { + struct bpf_link link; struct bpf_raw_event_map *btp; - struct bpf_prog *prog; }; -static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +static void bpf_raw_tp_link_release(struct bpf_link *link) { - struct bpf_raw_tracepoint *raw_tp = filp->private_data; + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); - if (raw_tp->prog) { - bpf_probe_unregister(raw_tp->btp, raw_tp->prog); - bpf_prog_put(raw_tp->prog); - } + bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); kfree(raw_tp); - return 0; } -static const struct file_operations bpf_raw_tp_fops = { - .release = bpf_raw_tracepoint_release, - .read = bpf_dummy_read, - .write = bpf_dummy_write, +static const struct bpf_link_ops bpf_raw_tp_lops = { + .release = bpf_raw_tp_link_release, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_tp_link *raw_tp; struct bpf_raw_event_map *btp; struct bpf_prog *prog; const char *tp_name; char buf[128]; - int tp_fd, err; + int link_fd, err; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; @@ -2302,21 +2435,20 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -ENOMEM; goto out_put_btp; } + bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog); raw_tp->btp = btp; - raw_tp->prog = prog; err = bpf_probe_register(raw_tp->btp, prog); if (err) goto out_free_tp; - tp_fd = anon_inode_getfd("bpf-raw-tracepoint", 
&bpf_raw_tp_fops, raw_tp, - O_CLOEXEC); - if (tp_fd < 0) { + link_fd = bpf_link_new_fd(&raw_tp->link); + if (link_fd < 0) { bpf_probe_unregister(raw_tp->btp, prog); - err = tp_fd; + err = link_fd; goto out_free_tp; } - return tp_fd; + return link_fd; out_free_tp: kfree(raw_tp); @@ -3266,15 +3398,21 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (err) goto out; - if (file->f_op == &bpf_raw_tp_fops) { - struct bpf_raw_tracepoint *raw_tp = file->private_data; - struct bpf_raw_event_map *btp = raw_tp->btp; + if (file->f_op == &bpf_link_fops) { + struct bpf_link *link = file->private_data; - err = bpf_task_fd_query_copy(attr, uattr, - raw_tp->prog->aux->id, - BPF_FD_TYPE_RAW_TRACEPOINT, - btp->tp->name, 0, 0); - goto put_file; + if (link->ops == &bpf_raw_tp_lops) { + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->link.prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + goto out_not_supp; } event = perf_get_event(file); @@ -3294,6 +3432,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, goto put_file; } +out_not_supp: err = -ENOTSUPP; put_file: fput(file); -- cgit v1.2.3 From 88fd9e5352fe05f7fe57778293aebd4cd106960b Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:47 +0100 Subject: bpf: Refactor trampoline update code As we need to introduce a third type of attachment for trampolines, the flattened signature of arch_prepare_bpf_trampoline gets even more complicated. Refactor the prog and count argument to arch_prepare_bpf_trampoline to use bpf_tramp_progs to simplify the addition and accounting for new attachment types. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-2-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 31 ++++++++++++----------- include/linux/bpf.h | 13 ++++++++-- kernel/bpf/bpf_struct_ops.c | 10 +++++++- kernel/bpf/trampoline.c | 62 +++++++++++++++++++++++++-------------------- 4 files changed, 71 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9ba08e9abc09..15c7d28bc05c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,12 +1362,12 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog **progs, int prog_cnt, int stack_size) + struct bpf_tramp_progs *tp, int stack_size) { u8 *prog = *pprog; int cnt = 0, i; - for (i = 0; i < prog_cnt; i++) { + for (i = 0; i < tp->nr_progs; i++) { if (emit_call(&prog, __bpf_prog_enter, prog)) return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ @@ -1376,17 +1376,17 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ - if (!progs[i]->jited) + if (!tp->progs[i]->jited) emit_mov_imm64(&prog, BPF_REG_2, - (long) progs[i]->insnsi >> 32, - (u32) (long) progs[i]->insnsi); + (long) tp->progs[i]->insnsi >> 32, + (u32) (long) tp->progs[i]->insnsi); /* call JITed bpf program or interpreter */ - if (emit_call(&prog, progs[i]->bpf_func, prog)) + if (emit_call(&prog, tp->progs[i]->bpf_func, prog)) return -EINVAL; /* arg1: mov 
rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, - (u32) (long) progs[i]); + emit_mov_imm64(&prog, BPF_REG_1, (long) tp->progs[i] >> 32, + (u32) (long) tp->progs[i]); /* arg2: mov rsi, rbx <- start time in nsec */ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); if (emit_call(&prog, __bpf_prog_exit, prog)) @@ -1458,12 +1458,13 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, */ int arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { int cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; + struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; + struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1492,12 +1493,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); - if (fentry_cnt) - if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + if (fentry->nr_progs) + if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry_cnt) + if (fentry->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ @@ -1507,8 +1508,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } - if (fexit_cnt) - if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + if (fexit->nr_progs) + if (invoke_bpf(m, &prog, fexit, stack_size)) return -EINVAL; if (flags & BPF_TRAMP_F_RESTORE_REGS) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f13c78c6f29d..98ec10b23dbb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -433,6 +433,16 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SKIP_FRAME BIT(2) +/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 + */ +#define BPF_MAX_TRAMP_PROGS 40 + +struct bpf_tramp_progs { + struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; + int nr_progs; +}; + /* Different use cases for BPF trampoline: * 1. 
replace nop at the function entry (kprobe equivalent) * flags = BPF_TRAMP_F_RESTORE_REGS @@ -455,8 +465,7 @@ struct btf_func_model { */ int arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call); /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c498f0fffb40..ca5cc8cdb6eb 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -320,6 +320,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; + struct bpf_tramp_progs *tprogs = NULL; void *udata, *kdata; int prog_fd, err = 0; void *image; @@ -343,6 +344,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->state || refcount_read(&uvalue->refcnt)) return -EINVAL; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return -ENOMEM; + uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; @@ -425,10 +430,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; + tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; err = arch_prepare_bpf_trampoline(image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, - &prog, 1, NULL, 0, NULL); + tprogs, NULL); if (err < 0) goto reset_unlock; @@ -469,6 +476,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: + kfree(tprogs); mutex_unlock(&st_map->lock); return err; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 704fa787fec0..546198f6f307 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -190,40 +190,49 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) return ret; } -/* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 - * bytes on x86. 
Pick a number to fit into BPF_IMAGE_SIZE / 2 - */ -#define BPF_MAX_TRAMP_PROGS 40 +static struct bpf_tramp_progs * +bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) +{ + const struct bpf_prog_aux *aux; + struct bpf_tramp_progs *tprogs; + struct bpf_prog **progs; + int kind; + + *total = 0; + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) + return ERR_PTR(-ENOMEM); + + for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { + tprogs[kind].nr_progs = tr->progs_cnt[kind]; + *total += tr->progs_cnt[kind]; + progs = tprogs[kind].progs; + + hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) + *progs++ = aux->prog; + } + return tprogs; +} static int bpf_trampoline_update(struct bpf_trampoline *tr) { void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; - struct bpf_prog *progs_to_run[BPF_MAX_TRAMP_PROGS]; - int fentry_cnt = tr->progs_cnt[BPF_TRAMP_FENTRY]; - int fexit_cnt = tr->progs_cnt[BPF_TRAMP_FEXIT]; - struct bpf_prog **progs, **fentry, **fexit; + struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; - struct bpf_prog_aux *aux; - int err; + int err, total; - if (fentry_cnt + fexit_cnt == 0) { + tprogs = bpf_trampoline_get_progs(tr, &total); + if (IS_ERR(tprogs)) + return PTR_ERR(tprogs); + + if (total == 0) { err = unregister_fentry(tr, old_image); tr->selector = 0; goto out; } - /* populate fentry progs */ - fentry = progs = progs_to_run; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FENTRY], tramp_hlist) - *progs++ = aux->prog; - - /* populate fexit progs */ - fexit = progs; - hlist_for_each_entry(aux, &tr->progs_hlist[BPF_TRAMP_FEXIT], tramp_hlist) - *progs++ = aux->prog; - - if (fexit_cnt) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -232,12 +241,11 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. */ + synchronize_rcu_tasks(); err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, - &tr->func.model, flags, - fentry, fentry_cnt, - fexit, fexit_cnt, + &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; @@ -252,6 +260,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; tr->selector++; out: + kfree(tprogs); return err; } @@ -409,8 +418,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_prog **fentry_progs, int fentry_cnt, - struct bpf_prog **fexit_progs, int fexit_cnt, + struct bpf_tramp_progs *tprogs, void *orig_call) { return -ENOTSUPP; -- cgit v1.2.3 From ae24082331d9bbaae283aafbe930a8f0eb85605a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:49 +0100 Subject: bpf: Introduce BPF_MODIFY_RETURN When multiple programs are attached, each program receives the return value from the previous program on the stack and the last program provides the return value to the attached function. The fmod_ret bpf programs are run after the fentry programs and before the fexit programs. The original function is only called if all the fmod_ret programs return 0 to avoid any unintended side-effects. The success value, i.e. 
0 is not currently configurable but can be made so where user-space can specify it at load time. For example: int func_to_be_attached(int a, int b) { <--- do_fentry do_fmod_ret: if (ret != 0) goto do_fexit; original_function: } <--- do_fexit The fmod_ret program attached to this function can be defined as: SEC("fmod_ret/func_to_be_attached") int BPF_PROG(func_name, int a, int b, int ret) { // This will skip the original function logic. return 1; } The first fmod_ret program is passed 0 in its return argument. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-4-kpsingh@chromium.org --- arch/x86/net/bpf_jit_comp.c | 130 +++++++++++++++++++++++++++++++++++++---- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 3 +- kernel/bpf/syscall.c | 1 + kernel/bpf/trampoline.c | 5 +- kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 130 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d6349e930b06..b1fd000feb89 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1362,7 +1362,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size) + struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; int cnt = 0; @@ -1383,6 +1383,13 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; + /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + * of the previous call which is then passed on the stack to + * the next BPF program. + */ + if (mod_ret) + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1442,6 +1449,23 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) return 0; } +static int emit_mod_ret_check_imm8(u8 **pprog, int value) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (!is_imm8(value)) + return -EINVAL; + + if (value == 0) + EMIT2(0x85, add_2reg(0xC0, BPF_REG_0, BPF_REG_0)); + else + EMIT3(0x83, add_1reg(0xF8, BPF_REG_0), value); + + *pprog = prog; + return 0; +} + static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_progs *tp, int stack_size) { @@ -1449,9 +1473,49 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, + struct bpf_tramp_progs *tp, int stack_size, + u8 **branches) +{ + u8 *prog = *pprog; + int i; + + /* The first fmod_ret program will receive a garbage return value. + * Set this to 0 to avoid confusing the program. 
+ */ + emit_mov_imm32(&prog, false, BPF_REG_0, 0); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + for (i = 0; i < tp->nr_progs; i++) { + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, true)) return -EINVAL; + + /* Generate a branch: + * + * if (ret != 0) + * goto do_fexit; + * + * If needed this can be extended to any integer value which can + * be passed by user-space when the program is loaded. + */ + if (emit_mod_ret_check_imm8(&prog, 0)) + return -EINVAL; + + /* Save the location of the branch and Generate 6 nops + * (4 bytes for an offset and 2 bytes for the jump) These nops + * are replaced with a conditional jump once do_fexit (i.e. the + * start of the fexit invocation) is finalized. + */ + branches[i] = prog; + emit_nops(&prog, 4 + 2); } + *pprog = prog; return 0; } @@ -1521,10 +1585,12 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call) { - int cnt = 0, nr_args = m->nr_args; + int ret, i, cnt = 0, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; + struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; + u8 **branches = NULL; u8 *prog; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ @@ -1557,24 +1623,60 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; + if (fmod_ret->nr_progs) { + branches = kcalloc(fmod_ret->nr_progs, sizeof(u8 *), + GFP_KERNEL); + if (!branches) + return -ENOMEM; + + if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size, + branches)) { + ret = -EINVAL; + goto cleanup; + } + } + if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs) + if (fentry->nr_progs || fmod_ret->nr_progs) restore_regs(m, &prog, nr_args, stack_size); /* call original function */ - if (emit_call(&prog, orig_call, prog)) - return -EINVAL; + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); } + if (fmod_ret->nr_progs) { + /* From Intel 64 and IA-32 Architectures Optimization + * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler + * Coding Rule 11: All branch targets should be 16-byte + * aligned. + */ + emit_align(&prog, 16); + /* Update the branches saved in invoke_bpf_mod_ret with the + * aligned address of do_fexit. + */ + for (i = 0; i < fmod_ret->nr_progs; i++) + emit_cond_near_jump(&branches[i], prog, branches[i], + X86_JNE); + } + if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) - return -EINVAL; + if (invoke_bpf(m, &prog, fexit, stack_size)) { + ret = -EINVAL; + goto cleanup; + } if (flags & BPF_TRAMP_F_RESTORE_REGS) restore_regs(m, &prog, nr_args, stack_size); + /* This needs to be done regardless. If there were fmod_ret programs, + * the return value is only updated on the stack and still needs to be + * restored to R0. 
+ */ if (flags & BPF_TRAMP_F_CALL_ORIG) /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); @@ -1586,9 +1688,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ EMIT1(0xC3); /* ret */ /* Make sure the trampoline generation logic doesn't overflow */ - if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) - return -EFAULT; - return prog - (u8 *)image; + if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) { + ret = -EFAULT; + goto cleanup; + } + ret = prog - (u8 *)image; + +cleanup: + kfree(branches); + return ret; } static int emit_fallback_jump(u8 **pprog) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 98ec10b23dbb..f748b31e5888 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -474,6 +474,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, + BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 787140095e58..30841fb8b3c0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,7 +3710,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, nr_args--; } - if (prog->expected_attach_type == BPF_TRACE_FEXIT && + if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && arg == nr_args) { if (!t) /* Default prog with 5 args. 6th arg is retval. 
*/ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 13de65363ba2..7ce0815793dd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2324,6 +2324,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN && prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 546198f6f307..221a17af1f81 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -232,7 +232,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) goto out; } - if (tprogs[BPF_TRAMP_FEXIT].nr_progs) + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || + tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; /* Though the second half of trampoline page is unused a task could be @@ -269,6 +270,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) switch (t) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; + case BPF_MODIFY_RETURN: + return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 289383edfc8c..2460c8e6b5be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9950,6 +9950,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!prog_extension) return -EINVAL; /* fallthrough */ + case BPF_MODIFY_RETURN: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (!btf_type_is_func(t)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d6b33ea27bcc..40b2d9476268 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -210,6 +210,7 @@ enum bpf_attach_type { BPF_TRACE_RAW_TP, BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From da00d2f117a08fbca262db5ea422c80a568b112b Mon Sep 17 00:00:00 2001 From: KP Singh Date: Wed, 4 Mar 2020 20:18:52 +0100 Subject: bpf: Add test ops for BPF_PROG_TYPE_TRACING The current fexit and fentry tests rely on a different program to exercise the functions they attach to. Instead of doing this, implement the test operations for tracing which will also be used for BPF_MODIFY_RETURN in a subsequent patch. Also, clean up the fexit test to use the generated skeleton. 
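To illustrate the resulting pattern (an editor's sketch based on the selftest changes below; error handling and the selftests' CHECK() reporting are trimmed, and run_fentry_test() is an illustrative name, not a function from this patch):

	#include <bpf/libbpf.h>
	#include <bpf/bpf.h>
	#include "fentry_test.skel.h"

	static void run_fentry_test(void)
	{
		struct fentry_test *skel;
		__u32 duration = 0, retval;
		int prog_fd;

		skel = fentry_test__open_and_load();
		if (!skel)
			return;
		if (fentry_test__attach(skel))
			goto out;

		/* Running one of the skeleton's own tracing programs now
		 * goes through bpf_prog_test_run_tracing(), which invokes
		 * the bpf_fentry_test*() kernel functions directly, so no
		 * separate packet program is needed to trigger them.
		 */
		prog_fd = bpf_program__fd(skel->progs.test1);
		bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL,
				  &retval, &duration);
	out:
		fentry_test__destroy(skel);
	}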
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200304191853.1529-7-kpsingh@chromium.org --- include/linux/bpf.h | 10 ++++ kernel/trace/bpf_trace.c | 1 + net/bpf/test_run.c | 37 +++++++++--- .../selftests/bpf/prog_tests/fentry_fexit.c | 12 +--- .../testing/selftests/bpf/prog_tests/fentry_test.c | 14 ++--- .../testing/selftests/bpf/prog_tests/fexit_test.c | 69 +++++++--------------- 6 files changed, 67 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f748b31e5888..40c53924571d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1156,6 +1156,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -1313,6 +1316,13 @@ static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 07764c761073..363e0a2c75cf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1266,6 +1266,7 @@ const struct bpf_verifier_ops tracing_verifier_ops = { }; const struct bpf_prog_ops tracing_prog_ops = { + .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1cd7a1c2f8b2..3600f098e7c6 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,18 +160,37 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, kfree(data); return ERR_PTR(-EFAULT); } - if (bpf_fentry_test1(1) != 2 || - bpf_fentry_test2(2, 3) != 5 || - bpf_fentry_test3(4, 5, 6) != 15 || - bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || - bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || - bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) { - kfree(data); - return ERR_PTR(-EFAULT); - } + return data; } +int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + int err = -EFAULT; + + switch (prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) + goto out; + break; + default: + goto out; + } + + err = 0; +out: + trace_bpf_test_finish(&err); + return err; +} + static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c index 235ac4f67f5b..83493bd5745c 100644 --- 
a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c @@ -1,22 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" #include "fexit_test.skel.h" void test_fentry_fexit(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; struct fexit_test *fexit_skel = NULL; __u64 *fentry_res, *fexit_res; __u32 duration = 0, retval; - int err, pkt_fd, i; + int err, prog_fd, i; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto close_prog; @@ -31,8 +26,8 @@ void test_fentry_fexit(void) if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) goto close_prog; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", @@ -49,7 +44,6 @@ void test_fentry_fexit(void) } close_prog: - test_pkt_access__destroy(pkt_skel); fentry_test__destroy(fentry_skel); fexit_test__destroy(fexit_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c index 5cc06021f27d..04ebbf1cb390 100644 --- a/tools/testing/selftests/bpf/prog_tests/fentry_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c @@ -1,20 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include -#include "test_pkt_access.skel.h" #include "fentry_test.skel.h" void test_fentry_test(void) { - struct test_pkt_access *pkt_skel = NULL; struct fentry_test *fentry_skel = NULL; - int err, pkt_fd, i; + int err, prog_fd, i; __u32 duration = 0, retval; __u64 *result; - pkt_skel = test_pkt_access__open_and_load(); - if (CHECK(!pkt_skel, "pkt_skel_load", "pkt_access skeleton failed\n")) - return; fentry_skel = fentry_test__open_and_load(); if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n")) goto cleanup; @@ -23,10 +18,10 @@ void test_fentry_test(void) if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err)) goto cleanup; - pkt_fd = bpf_program__fd(pkt_skel->progs.test_pkt_access); - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fentry_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); @@ -39,5 +34,4 @@ void test_fentry_test(void) cleanup: fentry_test__destroy(fentry_skel); - test_pkt_access__destroy(pkt_skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c index d2c3655dd7a3..78d7a2765c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c @@ -1,64 +1,37 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ #include +#include "fexit_test.skel.h" void test_fexit_test(void) { - struct bpf_prog_load_attr attr = { - .file = "./fexit_test.o", - }; - - char prog_name[] = 
"fexit/bpf_fentry_testX"; - struct bpf_object *obj = NULL, *pkt_obj; - int err, pkt_fd, kfree_skb_fd, i; - struct bpf_link *link[6] = {}; - struct bpf_program *prog[6]; + struct fexit_test *fexit_skel = NULL; + int err, prog_fd, i; __u32 duration = 0, retval; - struct bpf_map *data_map; - const int zero = 0; - u64 result[6]; + __u64 *result; - err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS, - &pkt_obj, &pkt_fd); - if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno)) - return; - err = bpf_prog_load_xattr(&attr, &obj, &kfree_skb_fd); - if (CHECK(err, "prog_load fail", "err %d errno %d\n", err, errno)) - goto close_prog; + fexit_skel = fexit_test__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; - for (i = 0; i < 6; i++) { - prog_name[sizeof(prog_name) - 2] = '1' + i; - prog[i] = bpf_object__find_program_by_title(obj, prog_name); - if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name)) - goto close_prog; - link[i] = bpf_program__attach_trace(prog[i]); - if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n")) - goto close_prog; - } - data_map = bpf_object__find_map_by_name(obj, "fexit_te.bss"); - if (CHECK(!data_map, "find_data_map", "data map not found\n")) - goto close_prog; + err = fexit_test__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; - err = bpf_prog_test_run(pkt_fd, 1, &pkt_v6, sizeof(pkt_v6), + prog_fd = bpf_program__fd(fexit_skel->progs.test1); + err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, NULL, &retval, &duration); - CHECK(err || retval, "ipv6", + CHECK(err || retval, "test_run", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); - err = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, &result); - if (CHECK(err, "get_result", - "failed to get output data: %d\n", err)) - goto close_prog; - - for (i = 0; i < 6; i++) - if (CHECK(result[i] != 1, "result", "bpf_fentry_test%d failed err %ld\n", - i + 1, result[i])) - goto close_prog; + result = (__u64 *)fexit_skel->bss; + for (i = 0; i < 6; i++) { + if (CHECK(result[i] != 1, "result", + "fexit_test%d failed err %lld\n", i + 1, result[i])) + goto cleanup; + } -close_prog: - for (i = 0; i < 6; i++) - if (!IS_ERR_OR_NULL(link[i])) - bpf_link__destroy(link[i]); - bpf_object__close(obj); - bpf_object__close(pkt_obj); +cleanup: + fexit_test__destroy(fexit_skel); } -- cgit v1.2.3 From d6e055e8733da5ce53fc69c77e379915400068c5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 29 Feb 2020 23:23:44 +0100 Subject: PCI: Add constant PCI_STATUS_ERROR_BITS This collection of PCI error bits is used in more than one driver, so move it to the PCI core. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/skge.h | 7 ------- drivers/net/ethernet/marvell/sky2.h | 7 ------- include/linux/pci.h | 7 +++++++ 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/marvell/skge.h b/drivers/net/ethernet/marvell/skge.h index 8ef19b3aa8ff..6928abcec0a3 100644 --- a/drivers/net/ethernet/marvell/skge.h +++ b/drivers/net/ethernet/marvell/skge.h @@ -15,13 +15,6 @@ #define PCI_VPD_ROM_SZ 7L<<14 /* VPD ROM size 0=256, 1=512, ... 
*/ #define PCI_REV_DESC 1<<2 /* Reverse Descriptor bytes */ -#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ - PCI_STATUS_SIG_SYSTEM_ERROR | \ - PCI_STATUS_REC_MASTER_ABORT | \ - PCI_STATUS_REC_TARGET_ABORT | \ - PCI_STATUS_SIG_TARGET_ABORT | \ - PCI_STATUS_PARITY) - enum csr_regs { B0_RAP = 0x0000, B0_CTST = 0x0004, diff --git a/drivers/net/ethernet/marvell/sky2.h b/drivers/net/ethernet/marvell/sky2.h index 2474d326319e..b2dddd8a246c 100644 --- a/drivers/net/ethernet/marvell/sky2.h +++ b/drivers/net/ethernet/marvell/sky2.h @@ -252,13 +252,6 @@ enum { }; -#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ - PCI_STATUS_SIG_SYSTEM_ERROR | \ - PCI_STATUS_REC_MASTER_ABORT | \ - PCI_STATUS_REC_TARGET_ABORT | \ - PCI_STATUS_SIG_TARGET_ABORT | \ - PCI_STATUS_PARITY) - enum csr_regs { B0_RAP = 0x0000, B0_CTST = 0x0004, diff --git a/include/linux/pci.h b/include/linux/pci.h index 3840a541a9de..101d71e0ad0d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -42,6 +42,13 @@ #include +#define PCI_STATUS_ERROR_BITS (PCI_STATUS_DETECTED_PARITY | \ + PCI_STATUS_SIG_SYSTEM_ERROR | \ + PCI_STATUS_REC_MASTER_ABORT | \ + PCI_STATUS_REC_TARGET_ABORT | \ + PCI_STATUS_SIG_TARGET_ABORT | \ + PCI_STATUS_PARITY) + /* * The PCI interface treats multi-function devices as independent * devices. The slot/function address of each device is encoded -- cgit v1.2.3 From ec5d9e87842a43be3a10ada0d5f560bbd3f31d5d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 29 Feb 2020 23:24:23 +0100 Subject: PCI: Add pci_status_get_and_clear_errors Several drivers use the following code sequence: 1. Read PCI_STATUS 2. Mask out non-error bits 3. Action based on error bits set 4. Write back set error bits to clear them As this is a repeated pattern, add a helper to the PCI core. Signed-off-by: Heiner Kallweit Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- drivers/pci/pci.c | 23 +++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index d828ca835a98..c16b0ba2a895 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -173,6 +173,29 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus) } EXPORT_SYMBOL_GPL(pci_bus_max_busnr); +/** + * pci_status_get_and_clear_errors - return and clear error bits in PCI_STATUS + * @pdev: the PCI device + * + * Returns error bits set in PCI_STATUS and clears them. 
+ */ +int pci_status_get_and_clear_errors(struct pci_dev *pdev) +{ + u16 status; + int ret; + + ret = pci_read_config_word(pdev, PCI_STATUS, &status); + if (ret != PCIBIOS_SUCCESSFUL) + return -EIO; + + status &= PCI_STATUS_ERROR_BITS; + if (status) + pci_write_config_word(pdev, PCI_STATUS, status); + + return status; +} +EXPORT_SYMBOL_GPL(pci_status_get_and_clear_errors); + #ifdef CONFIG_HAS_IOMEM void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 101d71e0ad0d..7beaf51e98ec 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1210,6 +1210,7 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); struct pci_dev *pci_real_dma_dev(struct pci_dev *dev); +int pci_status_get_and_clear_errors(struct pci_dev *pdev); int __printf(6, 7) pci_request_irq(struct pci_dev *dev, unsigned int nr, irq_handler_t handler, irq_handler_t thread_fn, void *dev_id, -- cgit v1.2.3 From 1326034b3ce7073e3ed74bd0f4d24afee96a9e07 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 19 Feb 2020 21:05:17 +0200 Subject: net/mlx5: Expose raw packet pacing APIs Expose raw packet pacing APIs to be used by DEVX based applications. The existing code was refactored into a single flow built on the new raw APIs. The new raw APIs take 'pp_rate_limit_context', uid and 'dedicated' as input when looking for an existing entry. This raw mode allows future device-specific data to be carried in the raw context without changing the existing logic and code. The ability to ask for a dedicated entry gives an application control to allocate entries according to its needs. A dedicated entry may not be used by any other process, and it also lets a process spread its resources across several entries so that different hardware resources are used to enforce the rate. The per-entry counter was changed to u64 to rule out any possibility of overflow. Signed-off-by: Yishai Hadas Acked-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 130 ++++++++++++++++++++------- include/linux/mlx5/driver.h | 11 ++- include/linux/mlx5/mlx5_ifc.h | 26 +++--- 3 files changed, 122 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index 01c380425f9d..f3b29d9ade1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -101,22 +101,39 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +static bool mlx5_rl_are_equal_raw(struct mlx5_rl_entry *entry, void *rl_in, + u16 uid) +{ + return (!memcmp(entry->rl_raw, rl_in, sizeof(entry->rl_raw)) && + entry->uid == uid); +} + /* Finds an entry where we can register the given rate * If the rate already exists, return the entry where it is registered, * otherwise return the first available entry.
* If the table is full, return NULL */ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, - struct mlx5_rate_limit *rl) + void *rl_in, u16 uid, bool dedicated) { struct mlx5_rl_entry *ret_entry = NULL; bool empty_found = false; int i; for (i = 0; i < table->max_size; i++) { - if (mlx5_rl_are_equal(&table->rl_entry[i].rl, rl)) - return &table->rl_entry[i]; - if (!empty_found && !table->rl_entry[i].rl.rate) { + if (dedicated) { + if (!table->rl_entry[i].refcount) + return &table->rl_entry[i]; + continue; + } + + if (table->rl_entry[i].refcount) { + if (table->rl_entry[i].dedicated) + continue; + if (mlx5_rl_are_equal_raw(&table->rl_entry[i], rl_in, + uid)) + return &table->rl_entry[i]; + } else if (!empty_found) { empty_found = true; ret_entry = &table->rl_entry[i]; } @@ -126,18 +143,19 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, } static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev, - u16 index, - struct mlx5_rate_limit *rl) + struct mlx5_rl_entry *entry, bool set) { - u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {}; + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {}; + void *pp_context; + pp_context = MLX5_ADDR_OF(set_pp_rate_limit_in, in, ctx); MLX5_SET(set_pp_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_PP_RATE_LIMIT); - MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index); - MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rl->rate); - MLX5_SET(set_pp_rate_limit_in, in, burst_upper_bound, rl->max_burst_sz); - MLX5_SET(set_pp_rate_limit_in, in, typical_packet_size, rl->typical_pkt_sz); + MLX5_SET(set_pp_rate_limit_in, in, uid, entry->uid); + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, entry->index); + if (set) + memcpy(pp_context, entry->rl_raw, sizeof(entry->rl_raw)); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } @@ -158,23 +176,25 @@ bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, } EXPORT_SYMBOL(mlx5_rl_are_equal); -int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, - struct mlx5_rate_limit *rl) +int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, + bool dedicated_entry, u16 *index) { struct mlx5_rl_table *table = &dev->priv.rl_table; struct mlx5_rl_entry *entry; int err = 0; + u32 rate; + rate = MLX5_GET(set_pp_rate_limit_context, rl_in, rate_limit); mutex_lock(&table->rl_lock); - if (!rl->rate || !mlx5_rl_is_in_range(dev, rl->rate)) { + if (!rate || !mlx5_rl_is_in_range(dev, rate)) { mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n", - rl->rate, table->min_rate, table->max_rate); + rate, table->min_rate, table->max_rate); err = -EINVAL; goto out; } - entry = find_rl_entry(table, rl); + entry = find_rl_entry(table, rl_in, uid, dedicated_entry); if (!entry) { mlx5_core_err(dev, "Max number of %u rates reached\n", table->max_size); @@ -185,16 +205,24 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, /* rate already configured */ entry->refcount++; } else { + memcpy(entry->rl_raw, rl_in, sizeof(entry->rl_raw)); + entry->uid = uid; /* new rate limit */ - err = mlx5_set_pp_rate_limit_cmd(dev, entry->index, rl); + err = mlx5_set_pp_rate_limit_cmd(dev, entry, true); if (err) { - mlx5_core_err(dev, "Failed configuring rate limit(err %d): rate %u, max_burst_sz %u, typical_pkt_sz %u\n", - err, rl->rate, rl->max_burst_sz, - rl->typical_pkt_sz); + mlx5_core_err( + dev, + "Failed configuring rate limit(err %d): rate %u, max_burst_sz %u, 
typical_pkt_sz %u\n", + err, rate, + MLX5_GET(set_pp_rate_limit_context, rl_in, + burst_upper_bound), + MLX5_GET(set_pp_rate_limit_context, rl_in, + typical_packet_size)); goto out; } - entry->rl = *rl; + entry->refcount = 1; + entry->dedicated = dedicated_entry; } *index = entry->index; @@ -202,20 +230,61 @@ out: mutex_unlock(&table->rl_lock); return err; } +EXPORT_SYMBOL(mlx5_rl_add_rate_raw); + +void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index) +{ + struct mlx5_rl_table *table = &dev->priv.rl_table; + struct mlx5_rl_entry *entry; + + mutex_lock(&table->rl_lock); + entry = &table->rl_entry[index - 1]; + entry->refcount--; + if (!entry->refcount) + /* need to remove rate */ + mlx5_set_pp_rate_limit_cmd(dev, entry, false); + mutex_unlock(&table->rl_lock); +} +EXPORT_SYMBOL(mlx5_rl_remove_rate_raw); + +int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, + struct mlx5_rate_limit *rl) +{ + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; + + MLX5_SET(set_pp_rate_limit_context, rl_raw, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_context, rl_raw, burst_upper_bound, + rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_context, rl_raw, typical_packet_size, + rl->typical_pkt_sz); + + return mlx5_rl_add_rate_raw(dev, rl_raw, + MLX5_CAP_QOS(dev, packet_pacing_uid) ? + MLX5_SHARED_RESOURCE_UID : 0, + false, index); +} EXPORT_SYMBOL(mlx5_rl_add_rate); void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) { + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; struct mlx5_rl_table *table = &dev->priv.rl_table; struct mlx5_rl_entry *entry = NULL; - struct mlx5_rate_limit reset_rl = {0}; /* 0 is a reserved value for unlimited rate */ if (rl->rate == 0) return; + MLX5_SET(set_pp_rate_limit_context, rl_raw, rate_limit, rl->rate); + MLX5_SET(set_pp_rate_limit_context, rl_raw, burst_upper_bound, + rl->max_burst_sz); + MLX5_SET(set_pp_rate_limit_context, rl_raw, typical_packet_size, + rl->typical_pkt_sz); + mutex_lock(&table->rl_lock); - entry = find_rl_entry(table, rl); + entry = find_rl_entry(table, rl_raw, + MLX5_CAP_QOS(dev, packet_pacing_uid) ? 
+ MLX5_SHARED_RESOURCE_UID : 0, false); if (!entry || !entry->refcount) { mlx5_core_warn(dev, "Rate %u, max_burst_sz %u typical_pkt_sz %u are not configured\n", rl->rate, rl->max_burst_sz, rl->typical_pkt_sz); @@ -223,11 +292,9 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) } entry->refcount--; - if (!entry->refcount) { + if (!entry->refcount) /* need to remove rate */ - mlx5_set_pp_rate_limit_cmd(dev, entry->index, &reset_rl); - entry->rl = reset_rl; - } + mlx5_set_pp_rate_limit_cmd(dev, entry, false); out: mutex_unlock(&table->rl_lock); @@ -273,14 +340,13 @@ int mlx5_init_rl_table(struct mlx5_core_dev *dev) void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev) { struct mlx5_rl_table *table = &dev->priv.rl_table; - struct mlx5_rate_limit rl = {0}; int i; /* Clear all configured rates */ for (i = 0; i < table->max_size; i++) - if (table->rl_entry[i].rl.rate) - mlx5_set_pp_rate_limit_cmd(dev, table->rl_entry[i].index, - &rl); + if (table->rl_entry[i].refcount) + mlx5_set_pp_rate_limit_cmd(dev, &table->rl_entry[i], + false); kfree(dev->priv.rl_table.rl_entry); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 277a51d3ec40..f2b4225ed650 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -518,9 +518,11 @@ struct mlx5_rate_limit { }; struct mlx5_rl_entry { - struct mlx5_rate_limit rl; - u16 index; - u16 refcount; + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)]; + u16 index; + u64 refcount; + u16 uid; + u8 dedicated : 1; }; struct mlx5_rl_table { @@ -1007,6 +1009,9 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u16 *index, struct mlx5_rate_limit *rl); void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl); bool mlx5_rl_is_in_range(struct mlx5_core_dev *dev, u32 rate); +int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, + bool dedicated_entry, u16 *index); +void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index); bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, struct mlx5_rate_limit *rl_1); int mlx5_alloc_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ff8c9d527bb4..7d89ab64b372 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -810,7 +810,9 @@ struct mlx5_ifc_qos_cap_bits { u8 reserved_at_4[0x1]; u8 packet_pacing_burst_bound[0x1]; u8 packet_pacing_typical_size[0x1]; - u8 reserved_at_7[0x19]; + u8 reserved_at_7[0x4]; + u8 packet_pacing_uid[0x1]; + u8 reserved_at_c[0x14]; u8 reserved_at_20[0x20]; @@ -8262,9 +8264,20 @@ struct mlx5_ifc_set_pp_rate_limit_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_set_pp_rate_limit_context_bits { + u8 rate_limit[0x20]; + + u8 burst_upper_bound[0x20]; + + u8 reserved_at_40[0x10]; + u8 typical_packet_size[0x10]; + + u8 reserved_at_60[0x120]; +}; + struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 opcode[0x10]; - u8 reserved_at_10[0x10]; + u8 uid[0x10]; u8 reserved_at_20[0x10]; u8 op_mod[0x10]; @@ -8274,14 +8287,7 @@ struct mlx5_ifc_set_pp_rate_limit_in_bits { u8 reserved_at_60[0x20]; - u8 rate_limit[0x20]; - - u8 burst_upper_bound[0x20]; - - u8 reserved_at_c0[0x10]; - u8 typical_packet_size[0x10]; - - u8 reserved_at_e0[0x120]; + struct mlx5_ifc_set_pp_rate_limit_context_bits ctx; }; struct mlx5_ifc_access_register_out_bits { -- cgit v1.2.3 From 95cddcb5cc202d3f2499596b9af5b77536c5f86a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 4 Mar 2020 21:15:31 -0800 
Subject: ethtool: add infrastructure for centralized checking of coalescing parameters Linux supports 22 different interrupt coalescing parameters. No driver implements them all. Some drivers just ignore the ones they don't support, while others have to carry a long list of checks to reject unsupported settings. To simplify the drivers, add the ability to specify inside ethtool_ops which parameters are supported and let the core reject attempts to set any other one. This commit makes the mechanism opt-in: only drivers which set ethtool_ops->supported_coalesce_params to a non-zero value will have the checks enforced. The same mask is used for global and per-queue settings. v3: - move the (temporary) check if driver defines types earlier (Michal) - rename used_types -> nonzero_params, and coalesce_types -> supported_coalesce_params (Alex) - use EOPNOTSUPP instead of EINVAL (Andrew, Michal) Leaving the long series of ifs for now; it seems nice to be able to grep for the field and flag names. This will probably have to be revisited once netlink support lands. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Reviewed-by: Michal Kubecek Reviewed-by: Andrew Lunn Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/ethtool.h | 45 ++++++++++++++++++++++++++++++--- net/ethtool/ioctl.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 23373978cb3c..e464c946bca4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -177,8 +177,44 @@ void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src); +#define ETHTOOL_COALESCE_RX_USECS BIT(0) +#define ETHTOOL_COALESCE_RX_MAX_FRAMES BIT(1) +#define ETHTOOL_COALESCE_RX_USECS_IRQ BIT(2) +#define ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ BIT(3) +#define ETHTOOL_COALESCE_TX_USECS BIT(4) +#define ETHTOOL_COALESCE_TX_MAX_FRAMES BIT(5) +#define ETHTOOL_COALESCE_TX_USECS_IRQ BIT(6) +#define ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ BIT(7) +#define ETHTOOL_COALESCE_STATS_BLOCK_USECS BIT(8) +#define ETHTOOL_COALESCE_USE_ADAPTIVE_RX BIT(9) +#define ETHTOOL_COALESCE_USE_ADAPTIVE_TX BIT(10) +#define ETHTOOL_COALESCE_PKT_RATE_LOW BIT(11) +#define ETHTOOL_COALESCE_RX_USECS_LOW BIT(12) +#define ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW BIT(13) +#define ETHTOOL_COALESCE_TX_USECS_LOW BIT(14) +#define ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW BIT(15) +#define ETHTOOL_COALESCE_PKT_RATE_HIGH BIT(16) +#define ETHTOOL_COALESCE_RX_USECS_HIGH BIT(17) +#define ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH BIT(18) +#define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) +#define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) +#define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) + +#define ETHTOOL_COALESCE_USECS \ + (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) +#define ETHTOOL_COALESCE_MAX_FRAMES \ + (ETHTOOL_COALESCE_RX_MAX_FRAMES | ETHTOOL_COALESCE_TX_MAX_FRAMES) +#define ETHTOOL_COALESCE_USECS_IRQ \ + (ETHTOOL_COALESCE_RX_USECS_IRQ | ETHTOOL_COALESCE_TX_USECS_IRQ) +#define ETHTOOL_COALESCE_MAX_FRAMES_IRQ \ + (ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ | \ + ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ) +#define ETHTOOL_COALESCE_USE_ADAPTIVE \ + (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX) + /** * struct ethtool_ops - optional netdev operations + * @supported_coalesce_params: supported types of interrupt coalescing.
* @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not * implemented, the @driver and @bus_info fields will be filled in @@ -207,8 +243,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * or zero. * @get_coalesce: Get interrupt coalescing parameters. Returns a negative * error code or zero. - * @set_coalesce: Set interrupt coalescing parameters. Returns a negative - * error code or zero. + * @set_coalesce: Set interrupt coalescing parameters. Supported coalescing + * types should be set in @supported_coalesce_params. + * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. * @get_pauseparam: Report pause parameters @@ -292,7 +329,8 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue. * It must check that the given queue number is valid. If neither a RX nor * a TX queue has this number, return -EINVAL. If only a RX queue or a TX - * queue has this number, ignore the inapplicable fields. + * queue has this number, ignore the inapplicable fields. Supported + * coalescing types should be set in @supported_coalesce_params. * Returns a negative error code or zero. * @get_link_ksettings: Get various device settings including Ethernet link * settings. The %cmd and %link_mode_masks_nwords fields should be @@ -323,6 +361,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * of the generic netdev features interface. */ struct ethtool_ops { + u32 supported_coalesce_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index f2fe8e5896dc..b2684ffa26de 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1544,6 +1544,64 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, return 0; } +static bool +ethtool_set_coalesce_supported(struct net_device *dev, + struct ethtool_coalesce *coalesce) +{ + u32 supported_params = dev->ethtool_ops->supported_coalesce_params; + u32 nonzero_params = 0; + + if (!supported_params) + return true; + + if (coalesce->rx_coalesce_usecs) + nonzero_params |= ETHTOOL_COALESCE_RX_USECS; + if (coalesce->rx_max_coalesced_frames) + nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES; + if (coalesce->rx_coalesce_usecs_irq) + nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ; + if (coalesce->rx_max_coalesced_frames_irq) + nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ; + if (coalesce->tx_coalesce_usecs) + nonzero_params |= ETHTOOL_COALESCE_TX_USECS; + if (coalesce->tx_max_coalesced_frames) + nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES; + if (coalesce->tx_coalesce_usecs_irq) + nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ; + if (coalesce->tx_max_coalesced_frames_irq) + nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ; + if (coalesce->stats_block_coalesce_usecs) + nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS; + if (coalesce->use_adaptive_rx_coalesce) + nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX; + if (coalesce->use_adaptive_tx_coalesce) + nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX; + if (coalesce->pkt_rate_low) + nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW; + if (coalesce->rx_coalesce_usecs_low) + nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW; 
+ if (coalesce->rx_max_coalesced_frames_low) + nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW; + if (coalesce->tx_coalesce_usecs_low) + nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW; + if (coalesce->tx_max_coalesced_frames_low) + nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW; + if (coalesce->pkt_rate_high) + nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH; + if (coalesce->rx_coalesce_usecs_high) + nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH; + if (coalesce->rx_max_coalesced_frames_high) + nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH; + if (coalesce->tx_coalesce_usecs_high) + nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH; + if (coalesce->tx_max_coalesced_frames_high) + nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH; + if (coalesce->rate_sample_interval) + nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL; + + return (supported_params & nonzero_params) == nonzero_params; +} + static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { @@ -1555,6 +1613,9 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; + if (!ethtool_set_coalesce_supported(dev, &coalesce)) + return -EOPNOTSUPP; + return dev->ethtool_ops->set_coalesce(dev, &coalesce); } @@ -2336,6 +2397,11 @@ ethtool_set_per_queue_coalesce(struct net_device *dev, goto roll_back; } + if (!ethtool_set_coalesce_supported(dev, &coalesce)) { + ret = -EOPNOTSUPP; + goto roll_back; + } + ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) goto roll_back; -- cgit v1.2.3 From aaca9408078914380fbfd8aef3c38a34b515a654 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Mar 2020 09:16:40 +0200 Subject: net: sched: Make FIFO Qdisc offloadable Invoke ndo_setup_tc() as appropriate to signal init / replacement, destroying and dumping of pFIFO / bFIFO Qdisc. A lot of the FIFO logic is used for pFIFO_head_drop as well, but that's a semantically very different Qdisc that isn't really in the same boat as pFIFO / bFIFO. Split some of the functions to keep the Qdisc intact. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + include/net/pkt_cls.h | 15 ++++++++ net/sched/sch_fifo.c | 97 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b6fedd54cd8e..654808bfad83 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -853,6 +853,7 @@ enum tc_setup_type { TC_SETUP_FT, TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, + TC_SETUP_QDISC_FIFO, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 53946b509b51..341a66af8d59 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -881,4 +881,19 @@ struct tc_tbf_qopt_offload { }; }; +enum tc_fifo_command { + TC_FIFO_REPLACE, + TC_FIFO_DESTROY, + TC_FIFO_STATS, +}; + +struct tc_fifo_qopt_offload { + enum tc_fifo_command command; + u32 handle; + u32 parent; + union { + struct tc_qopt_offload_stats stats; + }; +}; + #endif diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 37c8aa75d70c..a579a4131d22 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -12,6 +12,7 @@ #include #include #include +#include /* 1 band FIFO pseudo-"scheduler" */ @@ -51,8 +52,49 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_CN; } -static int fifo_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void fifo_offload_init(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_fifo_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_FIFO_REPLACE; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); +} + +static void fifo_offload_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct tc_fifo_qopt_offload qopt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + + qopt.command = TC_FIFO_DESTROY; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); +} + +static int fifo_offload_dump(struct Qdisc *sch) +{ + struct tc_fifo_qopt_offload qopt; + + qopt.command = TC_FIFO_STATS; + qopt.handle = sch->handle; + qopt.parent = sch->parent; + qopt.stats.bstats = &sch->bstats; + qopt.stats.qstats = &sch->qstats; + + return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt); +} + +static int __fifo_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { bool bypass; bool is_bfifo = sch->ops == &bfifo_qdisc_ops; @@ -82,10 +124,35 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + return 0; } -static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +static int fifo_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int err; + + err = __fifo_init(sch, opt, extack); + if (err) + return err; + + fifo_offload_init(sch); + return 0; +} + +static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + return __fifo_init(sch, opt, extack); +} + +static void fifo_destroy(struct Qdisc *sch) +{ + fifo_offload_destroy(sch); +} + +static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_fifo_qopt opt = { .limit = sch->limit }; @@ -97,6 +164,22 @@ 
nla_put_failure: return -1; } +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + int err; + + err = fifo_offload_dump(sch); + if (err) + return err; + + return __fifo_dump(sch, skb); +} + +static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return __fifo_dump(sch, skb); +} + struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .id = "pfifo", .priv_size = 0, @@ -104,6 +187,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = { .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, + .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, @@ -118,6 +202,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = { .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, .init = fifo_init, + .destroy = fifo_destroy, .reset = qdisc_reset_queue, .change = fifo_init, .dump = fifo_dump, @@ -131,10 +216,10 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = { .enqueue = pfifo_tail_enqueue, .dequeue = qdisc_dequeue_head, .peek = qdisc_peek_head, - .init = fifo_init, + .init = fifo_hd_init, .reset = qdisc_reset_queue, - .change = fifo_init, - .dump = fifo_dump, + .change = fifo_hd_init, + .dump = fifo_hd_dump, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 70c0923b0ef10b1c8d8f78fb50fcaef8eaae619d Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 2 Mar 2020 18:25:00 -0800 Subject: PCI: Introduce pci_get_dsn Several device drivers read their Device Serial Number from the PCIe extended config space. Introduce a new helper function, pci_get_dsn(). This function reads the eight bytes of the DSN and returns them as a u64. If the capability does not exist for the device, the function returns 0. Signed-off-by: Jacob Keller Cc: Bjorn Helgaas Cc: Jeff Kirsher Cc: Michael Chan Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- drivers/pci/pci.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/pci.h | 5 +++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c16b0ba2a895..86821313c007 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -580,6 +580,40 @@ int pci_find_ext_capability(struct pci_dev *dev, int cap) } EXPORT_SYMBOL_GPL(pci_find_ext_capability); +/** + * pci_get_dsn - Read and return the 8-byte Device Serial Number + * @dev: PCI device to query + * + * Looks up the PCI_EXT_CAP_ID_DSN and reads the 8 bytes of the Device Serial + * Number. + * + * Returns the DSN, or zero if the capability does not exist. + */ +u64 pci_get_dsn(struct pci_dev *dev) +{ + u32 dword; + u64 dsn; + int pos; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN); + if (!pos) + return 0; + + /* + * The Device Serial Number is two dwords offset 4 bytes from the + * capability position. The specification says that the first dword is + * the lower half, and the second dword is the upper half. 
+ */ + pos += 4; + pci_read_config_dword(dev, pos, &dword); + dsn = (u64)dword; + pci_read_config_dword(dev, pos + 4, &dword); + dsn |= ((u64)dword) << 32; + + return dsn; +} +EXPORT_SYMBOL_GPL(pci_get_dsn); + static int __pci_find_next_ht_cap(struct pci_dev *dev, int pos, int ht_cap) { int rc, ttl = PCI_FIND_CAP_TTL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 7beaf51e98ec..fc54b8922e66 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1052,6 +1052,8 @@ int pci_find_ht_capability(struct pci_dev *dev, int ht_cap); int pci_find_next_ht_capability(struct pci_dev *dev, int pos, int ht_cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); +u64 pci_get_dsn(struct pci_dev *dev); + struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from); struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, @@ -1707,6 +1709,9 @@ static inline int pci_find_next_capability(struct pci_dev *dev, u8 post, static inline int pci_find_ext_capability(struct pci_dev *dev, int cap) { return 0; } +static inline u64 pci_get_dsn(struct pci_dev *dev) +{ return 0; } + /* Power management related routines */ static inline int pci_save_state(struct pci_dev *dev) { return 0; } static inline void pci_restore_state(struct pci_dev *dev) { } -- cgit v1.2.3 From 86f5d0f3d4995b8b8909da7eea235e5bc1ceefbe Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Mon, 2 Mar 2020 16:15:19 -0800 Subject: net/mlx5: Introduce egress acl forward-to-vport capability Add the HCA_CAP.egress_acl_forward_to_vport field to indicate whether HW supports an e-switch vport's egress ACL forwarding packets to another e-switch vport. By default, the E-Switch egress ACL forwards e-switch vports' egress packets to their corresponding NIC/VF vports. With this cap enabled, the driver is allowed to alter this behavior and forward packets to arbitrary NIC/VF vports with the following limitations: a. Multiple processing paths are supported if all of the following conditions are met: - HCA_CAP.egress_acl_forward_to_vport is set ==1. - A destination of type Flow Table only appears once, as the last destination in the list. - Vport destination is supported if HCA_CAP.egress_acl_forward_to_vport==1. Vport must not be the Uplink. b. Flow_tag not supported. c. This table is only applicable after an FDB table is created. d. Push VLAN action is not supported. e. Pop VLAN action cannot be added concurrently to this table and FDB table. This feature will be used during port failover in a bonding scenario where two VF representors are bonded to handle failover egress traffic (VM's ingress/receive traffic).
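In driver code, the new field would then be queried through the usual e-switch flow-table capability macros before installing such a cross-vport forward rule. A minimal sketch (the helper name is made up for illustration; only the capability access itself is taken from this patch):

	#include <linux/mlx5/driver.h>
	#include <linux/mlx5/device.h>

	/* Illustrative helper: true when a vport's egress ACL may forward
	 * packets to another NIC/VF vport (the uplink is never a valid
	 * target, per the limitations above).
	 */
	static bool esw_egress_fwd_to_vport_supported(struct mlx5_core_dev *dev)
	{
		return MLX5_CAP_ESW_FLOWTABLE(dev, egress_acl_forward_to_vport);
	}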
Signed-off-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7d89ab64b372..07be46e713fc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -738,7 +738,7 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 flow_source[0x1]; u8 reserved_at_18[0x2]; u8 multi_fdb_encap[0x1]; - u8 reserved_at_1b[0x1]; + u8 egress_acl_forward_to_vport[0x1]; u8 fdb_multi_path_to_table[0x1]; u8 reserved_at_1d[0x3]; -- cgit v1.2.3 From bd673da6d933e3c8f652b011afba6cee0f8bbe45 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 2 Mar 2020 16:15:20 -0800 Subject: net/mlx5: Introduce TLS and IPSec objects enums Expose the TLS encryption key general object type enum correctly, and add the IPSec encryption key general object type enum. Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c index 3fc575d1c3ec..dcea87ec5977 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c @@ -42,7 +42,7 @@ int mlx5_create_encryption_key(struct mlx5_core_dev *mdev, MLX5_SET(encryption_key_obj, obj, key_size, general_obj_key_size); MLX5_SET(encryption_key_obj, obj, key_type, - MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_DEK); + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS); MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 07be46e713fc..2e98bba12356 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10489,7 +10489,8 @@ enum { }; enum { - MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_DEK = 0x1, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS = 0x1, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_IPSEC = 0x2, }; struct mlx5_ifc_tls_static_params_bits { -- cgit v1.2.3 From dc392fc56f39a00a46d6db2d150571ccafe99734 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 2 Mar 2020 16:15:21 -0800 Subject: net/mlx5: Expose link speed directly Expose port rate as part of the port speed register fields. Signed-off-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2e98bba12356..d0a678c82ccd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8423,7 +8423,8 @@ struct mlx5_ifc_ptys_reg_bits { u8 proto_mask[0x3]; u8 an_status[0x4]; - u8 reserved_at_24[0x1c]; + u8 reserved_at_24[0xc]; + u8 data_rate_oper[0x10]; u8 ext_eth_proto_capability[0x20]; -- cgit v1.2.3 From e0ebd8eb36ed850a22a9a0ca83edc4a40ad67c16 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 2 Mar 2020 16:15:22 -0800 Subject: net/mlx5: HW bit for goto chain offload support Add the HW bit definition indicating goto chain offload support.
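A consumer of the new bit would gate its offload decision on the per-table-type capability, e.g. for the FDB (editor's sketch; the surrounding error path is illustrative, only the capability name comes from this patch):

	/* Reject a rule that must both reformat (decap) and forward to
	 * another flow table when the device cannot combine the two.
	 */
	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat_and_fwd_to_table))
		return -EOPNOTSUPP;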
Signed-off-by: Eli Cohen Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d0a678c82ccd..9b8ff4e57002 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -414,7 +414,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_16[0x1]; u8 table_miss_action_domain[0x1]; u8 termination_table[0x1]; - u8 reserved_at_19[0x7]; + u8 reformat_and_fwd_to_table[0x1]; + u8 reserved_at_1a[0x6]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; u8 log_max_modify_header_context[0x8]; -- cgit v1.2.3 From d7f5f3c89c1a2344e88842ba0de327cc0098583e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 5 Mar 2020 22:28:15 -0600 Subject: remoteproc: add IPA notification to q6v5 driver Set up a subdev in the q6v5 modem remoteproc driver that generates event notifications for the IPA driver to use for initialization and recovery following a modem shutdown or crash. A pair of new functions provides a way for the IPA driver to register and deregister a notification callback function that will be called whenever modem events (about to boot, running, about to shut down, etc.) occur. A void pointer value (provided by the IPA driver at registration time) and an event type are supplied to the callback function. One event, MODEM_REMOVING, is signaled whenever the q6v5 driver is about to remove the notification subdevice. It requires the IPA driver de-register its callback. This sub-device is only used by the modem subsystem (MSS) driver, so the code that adds the new subdev and allows registration and deregistration of the notifier is found in "qcom_q6v6_mss.c". Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/remoteproc/Kconfig | 6 ++ drivers/remoteproc/Makefile | 1 + drivers/remoteproc/qcom_q6v5_ipa_notify.c | 85 +++++++++++++++++++++++++ drivers/remoteproc/qcom_q6v5_mss.c | 38 +++++++++++ include/linux/remoteproc/qcom_q6v5_ipa_notify.h | 82 ++++++++++++++++++++++++ 5 files changed, 212 insertions(+) create mode 100644 drivers/remoteproc/qcom_q6v5_ipa_notify.c create mode 100644 include/linux/remoteproc/qcom_q6v5_ipa_notify.h (limited to 'include/linux') diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index de3862c15fcc..56084635dd63 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -167,6 +167,12 @@ config QCOM_Q6V5_WCSS Say y here to support the Qualcomm Peripheral Image Loader for the Hexagon V5 based WCSS remote processors. 
+config QCOM_Q6V5_IPA_NOTIFY + tristate + depends on QCOM_IPA + depends on QCOM_Q6V5_MSS + default QCOM_IPA + config QCOM_SYSMON tristate "Qualcomm sysmon driver" depends on RPMSG diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile index e30a1b15fbac..0effd3825035 100644 --- a/drivers/remoteproc/Makefile +++ b/drivers/remoteproc/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_QCOM_Q6V5_ADSP) += qcom_q6v5_adsp.o obj-$(CONFIG_QCOM_Q6V5_MSS) += qcom_q6v5_mss.o obj-$(CONFIG_QCOM_Q6V5_PAS) += qcom_q6v5_pas.o obj-$(CONFIG_QCOM_Q6V5_WCSS) += qcom_q6v5_wcss.o +obj-$(CONFIG_QCOM_Q6V5_IPA_NOTIFY) += qcom_q6v5_ipa_notify.o obj-$(CONFIG_QCOM_SYSMON) += qcom_sysmon.o obj-$(CONFIG_QCOM_WCNSS_PIL) += qcom_wcnss_pil.o qcom_wcnss_pil-y += qcom_wcnss.o diff --git a/drivers/remoteproc/qcom_q6v5_ipa_notify.c b/drivers/remoteproc/qcom_q6v5_ipa_notify.c new file mode 100644 index 000000000000..e1c10a128bfd --- /dev/null +++ b/drivers/remoteproc/qcom_q6v5_ipa_notify.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Qualcomm IPA notification subdev support + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include + +static void +ipa_notify_common(struct rproc_subdev *subdev, enum qcom_rproc_event event) +{ + struct qcom_rproc_ipa_notify *ipa_notify; + qcom_ipa_notify_t notify; + + ipa_notify = container_of(subdev, struct qcom_rproc_ipa_notify, subdev); + notify = ipa_notify->notify; + if (notify) + notify(ipa_notify->data, event); +} + +static int ipa_notify_prepare(struct rproc_subdev *subdev) +{ + ipa_notify_common(subdev, MODEM_STARTING); + + return 0; +} + +static int ipa_notify_start(struct rproc_subdev *subdev) +{ + ipa_notify_common(subdev, MODEM_RUNNING); + + return 0; +} + +static void ipa_notify_stop(struct rproc_subdev *subdev, bool crashed) + +{ + ipa_notify_common(subdev, crashed ? 
MODEM_CRASHED : MODEM_STOPPING); +} + +static void ipa_notify_unprepare(struct rproc_subdev *subdev) +{ + ipa_notify_common(subdev, MODEM_OFFLINE); +} + +static void ipa_notify_removing(struct rproc_subdev *subdev) +{ + ipa_notify_common(subdev, MODEM_REMOVING); +} + +/* Register the IPA notification subdevice with the Q6V5 MSS remoteproc */ +void qcom_add_ipa_notify_subdev(struct rproc *rproc, + struct qcom_rproc_ipa_notify *ipa_notify) +{ + ipa_notify->notify = NULL; + ipa_notify->data = NULL; + ipa_notify->subdev.prepare = ipa_notify_prepare; + ipa_notify->subdev.start = ipa_notify_start; + ipa_notify->subdev.stop = ipa_notify_stop; + ipa_notify->subdev.unprepare = ipa_notify_unprepare; + + rproc_add_subdev(rproc, &ipa_notify->subdev); +} +EXPORT_SYMBOL_GPL(qcom_add_ipa_notify_subdev); + +/* Remove the IPA notification subdevice */ +void qcom_remove_ipa_notify_subdev(struct rproc *rproc, + struct qcom_rproc_ipa_notify *ipa_notify) +{ + struct rproc_subdev *subdev = &ipa_notify->subdev; + + ipa_notify_removing(subdev); + + rproc_remove_subdev(rproc, subdev); + ipa_notify->notify = NULL; /* Make it obvious */ +} +EXPORT_SYMBOL_GPL(qcom_remove_ipa_notify_subdev); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Qualcomm IPA notification remoteproc subdev"); diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index a1cc9cbe038f..f9ccce76e44b 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -22,6 +22,7 @@ #include #include #include +#include "linux/remoteproc/qcom_q6v5_ipa_notify.h" #include #include #include @@ -201,6 +202,7 @@ struct q6v5 { struct qcom_rproc_glink glink_subdev; struct qcom_rproc_subdev smd_subdev; struct qcom_rproc_ssr ssr_subdev; + struct qcom_rproc_ipa_notify ipa_notify_subdev; struct qcom_sysmon *sysmon; bool need_mem_protection; bool has_alt_reset; @@ -1540,6 +1542,39 @@ static int q6v5_alloc_memory_region(struct q6v5 *qproc) return 0; } +#if IS_ENABLED(CONFIG_QCOM_Q6V5_IPA_NOTIFY) + +/* Register IPA notification function */ +int qcom_register_ipa_notify(struct rproc *rproc, qcom_ipa_notify_t notify, + void *data) +{ + struct qcom_rproc_ipa_notify *ipa_notify; + struct q6v5 *qproc = rproc->priv; + + if (!notify) + return -EINVAL; + + ipa_notify = &qproc->ipa_notify_subdev; + if (ipa_notify->notify) + return -EBUSY; + + ipa_notify->notify = notify; + ipa_notify->data = data; + + return 0; +} +EXPORT_SYMBOL_GPL(qcom_register_ipa_notify); + +/* Deregister IPA notification function */ +void qcom_deregister_ipa_notify(struct rproc *rproc) +{ + struct q6v5 *qproc = rproc->priv; + + qproc->ipa_notify_subdev.notify = NULL; +} +EXPORT_SYMBOL_GPL(qcom_deregister_ipa_notify); +#endif /* !IS_ENABLED(CONFIG_QCOM_Q6V5_IPA_NOTIFY) */ + static int q6v5_probe(struct platform_device *pdev) { const struct rproc_hexagon_res *desc; @@ -1664,6 +1699,7 @@ static int q6v5_probe(struct platform_device *pdev) qcom_add_glink_subdev(rproc, &qproc->glink_subdev); qcom_add_smd_subdev(rproc, &qproc->smd_subdev); qcom_add_ssr_subdev(rproc, &qproc->ssr_subdev, "mpss"); + qcom_add_ipa_notify_subdev(rproc, &qproc->ipa_notify_subdev); qproc->sysmon = qcom_add_sysmon_subdev(rproc, "modem", 0x12); if (IS_ERR(qproc->sysmon)) { ret = PTR_ERR(qproc->sysmon); @@ -1677,6 +1713,7 @@ static int q6v5_probe(struct platform_device *pdev) return 0; detach_proxy_pds: + qcom_remove_ipa_notify_subdev(qproc->rproc, &qproc->ipa_notify_subdev); q6v5_pds_detach(qproc, qproc->proxy_pds, qproc->proxy_pd_count); detach_active_pds: 
q6v5_pds_detach(qproc, qproc->active_pds, qproc->active_pd_count); @@ -1693,6 +1730,7 @@ static int q6v5_remove(struct platform_device *pdev) rproc_del(qproc->rproc); qcom_remove_sysmon_subdev(qproc->sysmon); + qcom_remove_ipa_notify_subdev(qproc->rproc, &qproc->ipa_notify_subdev); qcom_remove_glink_subdev(qproc->rproc, &qproc->glink_subdev); qcom_remove_smd_subdev(qproc->rproc, &qproc->smd_subdev); qcom_remove_ssr_subdev(qproc->rproc, &qproc->ssr_subdev); diff --git a/include/linux/remoteproc/qcom_q6v5_ipa_notify.h b/include/linux/remoteproc/qcom_q6v5_ipa_notify.h new file mode 100644 index 000000000000..0820edc0ab7d --- /dev/null +++ b/include/linux/remoteproc/qcom_q6v5_ipa_notify.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Copyright (C) 2019 Linaro Ltd. */ + +#ifndef __QCOM_Q6V5_IPA_NOTIFY_H__ +#define __QCOM_Q6V5_IPA_NOTIFY_H__ + +#if IS_ENABLED(CONFIG_QCOM_Q6V5_IPA_NOTIFY) + +#include + +enum qcom_rproc_event { + MODEM_STARTING = 0, /* Modem is about to be started */ + MODEM_RUNNING = 1, /* Startup complete; modem is operational */ + MODEM_STOPPING = 2, /* Modem is about to shut down */ + MODEM_CRASHED = 3, /* Modem has crashed (implies stopping) */ + MODEM_OFFLINE = 4, /* Modem is now offline */ + MODEM_REMOVING = 5, /* Modem is about to be removed */ +}; + +typedef void (*qcom_ipa_notify_t)(void *data, enum qcom_rproc_event event); + +struct qcom_rproc_ipa_notify { + struct rproc_subdev subdev; + + qcom_ipa_notify_t notify; + void *data; +}; + +/** + * qcom_add_ipa_notify_subdev() - Register IPA notification subdevice + * @rproc: rproc handle + * @ipa_notify: IPA notification subdevice handle + * + * Register the @ipa_notify subdevice with the @rproc so modem events + * can be sent to IPA when they occur. + * + * This is defined in "qcom_q6v5_ipa_notify.c". + */ +void qcom_add_ipa_notify_subdev(struct rproc *rproc, + struct qcom_rproc_ipa_notify *ipa_notify); + +/** + * qcom_remove_ipa_notify_subdev() - Remove IPA SSR subdevice + * @rproc: rproc handle + * @ipa_notify: IPA notification subdevice handle + * + * This is defined in "qcom_q6v5_ipa_notify.c". + */ +void qcom_remove_ipa_notify_subdev(struct rproc *rproc, + struct qcom_rproc_ipa_notify *ipa_notify); + +/** + * qcom_register_ipa_notify() - Register IPA notification function + * @rproc: Remote processor handle + * @notify: Non-null IPA notification callback function pointer + * @data: Data supplied to IPA notification callback function + * + * @Return: 0 if successful, or a negative error code otherwise + * + * This is defined in "qcom_q6v5_mss.c". + */ +int qcom_register_ipa_notify(struct rproc *rproc, qcom_ipa_notify_t notify, + void *data); +/** + * qcom_deregister_ipa_notify() - Deregister IPA notification function + * @rproc: Remote processor handle + * + * This is defined in "qcom_q6v5_mss.c". + */ +void qcom_deregister_ipa_notify(struct rproc *rproc); + +#else /* !IS_ENABLED(CONFIG_QCOM_Q6V5_IPA_NOTIFY) */ + +struct qcom_rproc_ipa_notify { /* empty */ }; + +#define qcom_add_ipa_notify_subdev(rproc, ipa_notify) /* no-op */ +#define qcom_remove_ipa_notify_subdev(rproc, ipa_notify) /* no-op */ + +#endif /* !IS_ENABLED(CONFIG_QCOM_Q6V5_IPA_NOTIFY) */ + +#endif /* !__QCOM_Q6V5_IPA_NOTIFY_H__ */ -- cgit v1.2.3 From 7b70973d7edb2f005511102d5a2e0116464a46a1 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:32 +0000 Subject: bpf: sockmap: Only check ULP for TCP sockets The sock map code checks that a socket does not have an active upper layer protocol before inserting it into the map. 
This requires casting via inet_csk, which isn't valid for UDP sockets. Guard the ULP checks by testing inet_sk(sk)->is_icsk first. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-2-lmb@cloudflare.com --- include/linux/skmsg.h | 8 +++++++- include/net/inet_connection_sock.h | 6 ++++++ net/core/sock_map.c | 6 ++---- net/ipv4/tcp_ulp.c | 7 ------- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 112765bd146d..4d3d75d63066 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -360,7 +360,13 @@ static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 895546058a20..a3f076befa4f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -335,4 +335,10 @@ static inline void inet_csk_inc_pingpong_cnt(struct sock *sk) if (icsk->icsk_ack.pingpong < U8_MAX) icsk->icsk_ack.pingpong++; } + +static inline bool inet_csk_has_ulp(struct sock *sk) +{ + return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops; +} + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2e0f465295c3..cb8f740f7949 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -384,7 +384,6 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct inet_connection_sock *icsk = inet_csk(sk); struct sk_psock_link *link; struct sk_psock *psock; struct sock *osk; @@ -395,7 +394,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, return -EINVAL; if (unlikely(idx >= map->max_entries)) return -E2BIG; - if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data))) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); @@ -738,7 +737,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, struct sock *sk, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct inet_connection_sock *icsk = inet_csk(sk); u32 key_size = map->key_size, hash; struct bpf_htab_elem *elem, *elem_new; struct bpf_htab_bucket *bucket; @@ -749,7 +747,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, WARN_ON_ONCE(!rcu_read_lock_held()); if (unlikely(flags > BPF_EXIST)) return -EINVAL; - if (unlikely(icsk->icsk_ulp_data)) + if (inet_csk_has_ulp(sk)) return -EINVAL; link = sk_psock_init_link(); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2703f24c5d1a..7c27aa629af1 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -105,13 +105,6 @@ void tcp_update_ulp(struct sock *sk, struct proto *proto, { struct inet_connection_sock *icsk = inet_csk(sk); - if (!icsk->icsk_ulp_ops) { - sk->sk_write_space = write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, proto); - return; - } - if
(icsk->icsk_ulp_ops->update) icsk->icsk_ulp_ops->update(sk, proto, write_space); } -- cgit v1.2.3 From 1a2e20132db7bb76dd4f97b8364bd167227dd15f Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:33 +0000 Subject: skmsg: Update saved hooks only once Only update psock->saved_* if psock->sk_proto has not been initialized yet. This allows us to get rid of tcp_bpf_reinit_sk_prot. Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-3-lmb@cloudflare.com --- include/linux/skmsg.h | 20 ++++++++++++++++---- net/ipv4/tcp_bpf.c | 16 +--------------- 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 4d3d75d63066..2be51b7a5800 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -347,11 +347,23 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - psock->saved_unhash = sk->sk_prot->unhash; - psock->saved_close = sk->sk_prot->close; - psock->saved_write_space = sk->sk_write_space; + /* Initialize saved callbacks and original proto only once, since this + * function may be called multiple times for a psock, e.g. when + * psock->progs.msg_parser is updated. + * + * Since we've not installed the new proto, psock is not yet in use and + * we can initialize it without synchronization. + */ + if (!psock->sk_proto) { + struct proto *orig = READ_ONCE(sk->sk_prot); + + psock->saved_unhash = orig->unhash; + psock->saved_close = orig->close; + psock->saved_write_space = sk->sk_write_space; + + psock->sk_proto = orig; + } - psock->sk_proto = sk->sk_prot; /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 7d6e1b75d4d4..3327afa05c3d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -637,20 +637,6 @@ static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock) sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]); } -static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; - int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; - - /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed - * or added requiring sk_prot hook updates. We keep original saved - * hooks in this case. - * - * Pairs with lockless read in sk_clone_lock(). - */ - WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); -} - static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call @@ -670,7 +656,7 @@ void tcp_bpf_reinit(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); - tcp_bpf_reinit_sk_prot(sk, psock); + tcp_bpf_update_sk_prot(sk, psock); rcu_read_unlock(); } -- cgit v1.2.3 From f747632b608f90217a4e9ebb1deba8a37612aa32 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 9 Mar 2020 11:12:36 +0000 Subject: bpf: sockmap: Move generic sockmap hooks from BPF TCP The init, close and unhash handlers from TCP sockmap are generic, and can be reused by UDP sockmap. Move the helpers into the sockmap code base and expose them. This requires tcp_bpf_get_proto and tcp_bpf_clone to be conditional on BPF_STREAM_PARSER. The moved functions are unmodified, except that sk_psock_unlink is renamed to sock_map_unlink to better match its behaviour. 
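As a sketch of the intended reuse (hypothetical, not part of this series), a UDP sockmap proto template could now wire up the generic hooks directly:

/* Hypothetical sketch of UDP reuse: build a proto template from the
 * kernel's udp_prot and point it at the now-generic sockmap hooks.
 * The function and its callers are illustrative names only.
 */
static void udp_bpf_rebuild_proto(struct proto *prot, const struct proto *base)
{
	*prot = *base;			/* start from udp_prot */
	prot->unhash = sock_map_unhash;	/* generic psock link teardown */
	prot->close = sock_map_close;	/* generic close, restores saved close */
}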
Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309111243.6982-6-lmb@cloudflare.com --- include/linux/bpf.h | 4 +- include/linux/skmsg.h | 28 ------------- include/net/tcp.h | 15 ++++--- net/core/sock_map.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++--- net/ipv4/tcp_bpf.c | 84 +++------------------------------------ 5 files changed, 118 insertions(+), 119 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c53924571d..94a329b9da81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1419,6 +1419,8 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #if defined(CONFIG_BPF_STREAM_PARSER) int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +void sock_map_unhash(struct sock *sk); +void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which) @@ -1431,7 +1433,7 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } -#endif +#endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) void bpf_sk_reuseport_detach(struct sock *sk); diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2be51b7a5800..8a709f63c5e5 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -323,14 +323,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) } struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -#if defined(CONFIG_BPF_STREAM_PARSER) -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); -#else -static inline void sk_psock_unlink(struct sock *sk, - struct sk_psock_link *link) -{ -} -#endif void __sk_psock_purge_ingress_msg(struct sk_psock *psock); @@ -399,26 +391,6 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, return test_bit(bit, &psock->state); } -static inline struct sk_psock *sk_psock_get_checked(struct sock *sk) -{ - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (psock) { - if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) { - psock = ERR_PTR(-EBUSY); - goto out; - } - - if (!refcount_inc_not_zero(&psock->refcnt)) - psock = ERR_PTR(-EBUSY); - } -out: - rcu_read_unlock(); - return psock; -} - static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; diff --git a/include/net/tcp.h b/include/net/tcp.h index ad3abeaa703e..43fa07a36fa6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2195,19 +2195,22 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, struct sk_msg; struct sk_psock; +#ifdef CONFIG_BPF_STREAM_PARSER +struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#else +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + #ifdef CONFIG_NET_SOCK_MSG -int tcp_bpf_init(struct sock *sk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); -void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else 
-static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} #endif /* CONFIG_NET_SOCK_MSG */ /* Call BPF_SOCK_OPS program that returns an int. If the return value diff --git a/net/core/sock_map.c b/net/core/sock_map.c index fafcbd22ecba..cb240d87e068 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -141,6 +141,51 @@ static void sock_map_unref(struct sock *sk, void *link_raw) } } +static int sock_map_init_proto(struct sock *sk) +{ + struct sk_psock *psock; + struct proto *prot; + + sock_owned_by_me(sk); + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + prot = tcp_bpf_get_proto(sk, psock); + if (IS_ERR(prot)) { + rcu_read_unlock(); + return PTR_ERR(prot); + } + + sk_psock_update_proto(sk, psock, prot); + rcu_read_unlock(); + return 0; +} + +static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { @@ -172,7 +217,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } - psock = sk_psock_get_checked(sk); + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); goto out_progs; @@ -196,7 +241,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (msg_parser) psock_set_prog(&psock->progs.msg_parser, msg_parser); - ret = tcp_bpf_init(sk); + ret = sock_map_init_proto(sk); if (ret < 0) goto out_drop; @@ -231,7 +276,7 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) struct sk_psock *psock; int ret; - psock = sk_psock_get_checked(sk); + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) return PTR_ERR(psock); @@ -241,7 +286,7 @@ static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) return -ENOMEM; } - ret = tcp_bpf_init(sk); + ret = sock_map_init_proto(sk); if (ret < 0) sk_psock_put(sk, psock); return ret; @@ -1120,7 +1165,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } -void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) +static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { case BPF_MAP_TYPE_SOCKMAP: @@ -1133,3 +1178,54 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link) break; } } + +static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_link *link; + + while ((link = sk_psock_link_pop(psock))) { + sock_map_unlink(sk, link); + sk_psock_free_link(link); + } +} + +void sock_map_unhash(struct sock *sk) +{ + void (*saved_unhash)(struct sock *sk); + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + + saved_unhash = psock->saved_unhash; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + saved_unhash(sk); +} + +void sock_map_close(struct sock *sk, long timeout) +{ + void (*saved_close)(struct sock *sk, long timeout); + struct sk_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + 
release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + + saved_close = psock->saved_close; + sock_map_remove_links(sk, psock); + rcu_read_unlock(); + release_sock(sk); + saved_close(sk, timeout); +} diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ed8a8f3c9afe..fe7b4fbc31c1 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -528,57 +528,7 @@ out_err: return copied ? copied : err; } -static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_link *link; - - while ((link = sk_psock_link_pop(psock))) { - sk_psock_unlink(sk, link); - sk_psock_free_link(link); - } -} - -static void tcp_bpf_unhash(struct sock *sk) -{ - void (*saved_unhash)(struct sock *sk); - struct sk_psock *psock; - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - - saved_unhash = psock->saved_unhash; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - saved_unhash(sk); -} - -static void tcp_bpf_close(struct sock *sk, long timeout) -{ - void (*saved_close)(struct sock *sk, long timeout); - struct sk_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - saved_close = psock->saved_close; - tcp_bpf_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - saved_close(sk, timeout); -} - +#ifdef CONFIG_BPF_STREAM_PARSER enum { TCP_BPF_IPV4, TCP_BPF_IPV6, @@ -599,8 +549,8 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; - prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash; - prot[TCP_BPF_BASE].close = tcp_bpf_close; + prot[TCP_BPF_BASE].unhash = sock_map_unhash; + prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; @@ -640,7 +590,7 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -static struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? 
TCP_BPF_TX : TCP_BPF_BASE; @@ -657,31 +607,6 @@ static struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) return &tcp_bpf_prots[family][config]; } -int tcp_bpf_init(struct sock *sk) -{ - struct sk_psock *psock; - struct proto *prot; - - sock_owned_by_me(sk); - - rcu_read_lock(); - psock = sk_psock(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - prot = tcp_bpf_get_proto(sk, psock); - if (IS_ERR(prot)) { - rcu_read_unlock(); - return PTR_ERR(prot); - } - - sk_psock_update_proto(sk, psock, prot); - rcu_read_unlock(); - return 0; -} - /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state @@ -695,3 +620,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } +#endif /* CONFIG_BPF_STREAM_PARSER */ -- cgit v1.2.3 From fcb26bd2b6cab573f06e5855638368cf88e99c2b Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 9 Mar 2020 09:36:26 +0100 Subject: net: phy: Add Synopsys DesignWare XPCS MDIO module Synopsys DesignWare XPCS is an MMD that can manage link status, auto-negotiation, link training, ... In this commit we add basic support for the XPCS using the USXGMII interface and Clause 73 auto-negotiation. It is tightly tied to PHYLINK and can't be used without it. A given Ethernet driver can use the provided callbacks to add support for the XPCS. Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- MAINTAINERS | 7 + drivers/net/phy/Kconfig | 6 + drivers/net/phy/Makefile | 1 + drivers/net/phy/mdio-xpcs.c | 612 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mdio-xpcs.h | 41 +++ 5 files changed, 667 insertions(+) create mode 100644 drivers/net/phy/mdio-xpcs.c create mode 100644 include/linux/mdio-xpcs.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index e8666f980a21..6918c3f0ff1c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16117,6 +16117,13 @@ L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/synopsys/ +SYNOPSYS DESIGNWARE ETHERNET XPCS DRIVER +M: Jose Abreu +L: netdev@vger.kernel.org +S: Supported +F: drivers/net/phy/mdio-xpcs.c +F: include/linux/mdio-xpcs.h + SYNOPSYS DESIGNWARE I2C DRIVER M: Jarkko Nikula R: Andy Shevchenko diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index d6f197e06134..cc7f1df855da 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -214,6 +214,12 @@ config MDIO_XGENE This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. +config MDIO_XPCS + tristate "Synopsys DesignWare XPCS controller" + help + This module provides helper functions for Synopsys DesignWare XPCS + controllers.
+ endif endif diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index d9b3c0fec8e3..26f8039f300f 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_MDIO_OCTEON) += mdio-octeon.o obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_THUNDER) += mdio-thunder.o obj-$(CONFIG_MDIO_XGENE) += mdio-xgene.o +obj-$(CONFIG_MDIO_XPCS) += mdio-xpcs.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += mii_timestamper.o diff --git a/drivers/net/phy/mdio-xpcs.c b/drivers/net/phy/mdio-xpcs.c new file mode 100644 index 000000000000..973f588146f7 --- /dev/null +++ b/drivers/net/phy/mdio-xpcs.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare XPCS helpers + * + * Author: Jose Abreu + */ + +#include +#include +#include +#include +#include + +#define SYNOPSYS_XPCS_USXGMII_ID 0x7996ced0 +#define SYNOPSYS_XPCS_10GKR_ID 0x7996ced0 +#define SYNOPSYS_XPCS_MASK 0xffffffff + +/* Vendor regs access */ +#define DW_VENDOR BIT(15) + +/* VR_XS_PCS */ +#define DW_USXGMII_RST BIT(10) +#define DW_USXGMII_EN BIT(9) +#define DW_VR_XS_PCS_DIG_STS 0x0010 +#define DW_RXFIFO_ERR GENMASK(6, 5) + +/* SR_MII */ +#define DW_USXGMII_FULL BIT(8) +#define DW_USXGMII_SS_MASK (BIT(13) | BIT(6) | BIT(5)) +#define DW_USXGMII_10000 (BIT(13) | BIT(6)) +#define DW_USXGMII_5000 (BIT(13) | BIT(5)) +#define DW_USXGMII_2500 (BIT(5)) +#define DW_USXGMII_1000 (BIT(6)) +#define DW_USXGMII_100 (BIT(13)) +#define DW_USXGMII_10 (0) + +/* SR_AN */ +#define DW_SR_AN_ADV1 0x10 +#define DW_SR_AN_ADV2 0x11 +#define DW_SR_AN_ADV3 0x12 +#define DW_SR_AN_LP_ABL1 0x13 +#define DW_SR_AN_LP_ABL2 0x14 +#define DW_SR_AN_LP_ABL3 0x15 + +/* Clause 73 Defines */ +/* AN_LP_ABL1 */ +#define DW_C73_PAUSE BIT(10) +#define DW_C73_ASYM_PAUSE BIT(11) +#define DW_C73_AN_ADV_SF 0x1 +/* AN_LP_ABL2 */ +#define DW_C73_1000KX BIT(5) +#define DW_C73_10000KX4 BIT(6) +#define DW_C73_10000KR BIT(7) +/* AN_LP_ABL3 */ +#define DW_C73_2500KX BIT(0) +#define DW_C73_5000KR BIT(1) + +static const int xpcs_usxgmii_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_Autoneg_BIT, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + +static const int xpcs_10gkr_features[] = { + ETHTOOL_LINK_MODE_Pause_BIT, + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + +static const phy_interface_t xpcs_usxgmii_interfaces[] = { + PHY_INTERFACE_MODE_USXGMII, + PHY_INTERFACE_MODE_MAX, +}; + +static const phy_interface_t xpcs_10gkr_interfaces[] = { + PHY_INTERFACE_MODE_10GKR, + PHY_INTERFACE_MODE_MAX, +}; + +static struct xpcs_id { + u32 id; + u32 mask; + const int *supported; + const phy_interface_t *interface; +} xpcs_id_list[] = { + { + .id = SYNOPSYS_XPCS_USXGMII_ID, + .mask = SYNOPSYS_XPCS_MASK, + .supported = xpcs_usxgmii_features, + .interface = xpcs_usxgmii_interfaces, + }, { + .id = SYNOPSYS_XPCS_10GKR_ID, + .mask = SYNOPSYS_XPCS_MASK, + .supported = xpcs_10gkr_features, + .interface = xpcs_10gkr_interfaces, + }, +}; + +static int xpcs_read(struct mdio_xpcs_args *xpcs, int dev, u32 reg) +{ + u32 reg_addr = MII_ADDR_C45 | dev << 16 | reg; + + return mdiobus_read(xpcs->bus, xpcs->addr, reg_addr); +} + +static int xpcs_write(struct mdio_xpcs_args *xpcs, int dev, u32 reg, u16 val) +{ + u32 
reg_addr = MII_ADDR_C45 | dev << 16 | reg; + + return mdiobus_write(xpcs->bus, xpcs->addr, reg_addr, val); +} + +static int xpcs_read_vendor(struct mdio_xpcs_args *xpcs, int dev, u32 reg) +{ + return xpcs_read(xpcs, dev, DW_VENDOR | reg); +} + +static int xpcs_write_vendor(struct mdio_xpcs_args *xpcs, int dev, int reg, + u16 val) +{ + return xpcs_write(xpcs, dev, DW_VENDOR | reg, val); +} + +static int xpcs_read_vpcs(struct mdio_xpcs_args *xpcs, int reg) +{ + return xpcs_read_vendor(xpcs, MDIO_MMD_PCS, reg); +} + +static int xpcs_write_vpcs(struct mdio_xpcs_args *xpcs, int reg, u16 val) +{ + return xpcs_write_vendor(xpcs, MDIO_MMD_PCS, reg, val); +} + +static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev) +{ + /* Poll until the reset bit clears (50ms per retry == 0.6 sec) */ + unsigned int retries = 12; + int ret; + + do { + msleep(50); + ret = xpcs_read(xpcs, dev, MDIO_CTRL1); + if (ret < 0) + return ret; + } while (ret & MDIO_CTRL1_RESET && --retries); + + return (ret & MDIO_CTRL1_RESET) ? -ETIMEDOUT : 0; +} + +static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs, int dev) +{ + int ret; + + ret = xpcs_write(xpcs, dev, MDIO_CTRL1, MDIO_CTRL1_RESET); + if (ret < 0) + return ret; + + return xpcs_poll_reset(xpcs, dev); +} + +#define xpcs_warn(__xpcs, __state, __args...) \ +({ \ + if ((__state)->link) \ + dev_warn(&(__xpcs)->bus->dev, ##__args); \ +}) + +static int xpcs_read_fault(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_STAT1); + if (ret < 0) + return ret; + + if (ret & MDIO_STAT1_FAULT) { + xpcs_warn(xpcs, state, "Link fault condition detected!\n"); + return -EFAULT; + } + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_STAT2); + if (ret < 0) + return ret; + + if (ret & MDIO_STAT2_RXFAULT) + xpcs_warn(xpcs, state, "Receiver fault detected!\n"); + if (ret & MDIO_STAT2_TXFAULT) + xpcs_warn(xpcs, state, "Transmitter fault detected!\n"); + + ret = xpcs_read_vendor(xpcs, MDIO_MMD_PCS, DW_VR_XS_PCS_DIG_STS); + if (ret < 0) + return ret; + + if (ret & DW_RXFIFO_ERR) { + xpcs_warn(xpcs, state, "FIFO fault condition detected!\n"); + return -EFAULT; + } + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_PCS_10GBRT_STAT1); + if (ret < 0) + return ret; + + if (!(ret & MDIO_PCS_10GBRT_STAT1_BLKLK)) + xpcs_warn(xpcs, state, "Link is not locked!\n"); + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_PCS_10GBRT_STAT2); + if (ret < 0) + return ret; + + if (ret & MDIO_PCS_10GBRT_STAT2_ERR) + xpcs_warn(xpcs, state, "Link has errors!\n"); + + return 0; +} + +static int xpcs_read_link(struct mdio_xpcs_args *xpcs, bool an) +{ + bool link = true; + int ret; + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MDIO_STAT1); + if (ret < 0) + return ret; + + if (!(ret & MDIO_STAT1_LSTATUS)) + link = false; + + if (an) { + ret = xpcs_read(xpcs, MDIO_MMD_AN, MDIO_STAT1); + if (ret < 0) + return ret; + + if (!(ret & MDIO_STAT1_LSTATUS)) + link = false; + } + + return link; +} + +static int xpcs_get_max_usxgmii_speed(const unsigned long *supported) +{ + int max = SPEED_UNKNOWN; + + if (phylink_test(supported, 1000baseKX_Full)) + max = SPEED_1000; + if (phylink_test(supported, 2500baseX_Full)) + max = SPEED_2500; + if (phylink_test(supported, 10000baseKX4_Full)) + max = SPEED_10000; + if (phylink_test(supported, 10000baseKR_Full)) + max = SPEED_10000; + + return max; +} + +static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed) +{ + int ret, speed_sel; + + switch (speed) { + case SPEED_10: + speed_sel = DW_USXGMII_10; + break; + case 
SPEED_100: + speed_sel = DW_USXGMII_100; + break; + case SPEED_1000: + speed_sel = DW_USXGMII_1000; + break; + case SPEED_2500: + speed_sel = DW_USXGMII_2500; + break; + case SPEED_5000: + speed_sel = DW_USXGMII_5000; + break; + case SPEED_10000: + speed_sel = DW_USXGMII_10000; + break; + default: + /* Nothing to do here */ + return -EINVAL; + } + + ret = xpcs_read_vpcs(xpcs, MDIO_CTRL1); + if (ret < 0) + return ret; + + ret = xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_EN); + if (ret < 0) + return ret; + + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1); + if (ret < 0) + return ret; + + ret &= ~DW_USXGMII_SS_MASK; + ret |= speed_sel | DW_USXGMII_FULL; + + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, MDIO_CTRL1, ret); + if (ret < 0) + return ret; + + ret = xpcs_read_vpcs(xpcs, MDIO_CTRL1); + if (ret < 0) + return ret; + + return xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST); +} + +static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs) +{ + int ret, adv; + + /* By default, in USXGMII mode XPCS operates at 10G baud and + * replicates data to achieve lower speeds. Hereby, in this + * default configuration we need to advertise all supported + * modes and not only the ones we want to use. + */ + + /* SR_AN_ADV3 */ + adv = 0; + if (phylink_test(xpcs->supported, 2500baseX_Full)) + adv |= DW_C73_2500KX; + + /* TODO: 5000baseKR */ + + ret = xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV3, adv); + if (ret < 0) + return ret; + + /* SR_AN_ADV2 */ + adv = 0; + if (phylink_test(xpcs->supported, 1000baseKX_Full)) + adv |= DW_C73_1000KX; + if (phylink_test(xpcs->supported, 10000baseKX4_Full)) + adv |= DW_C73_10000KX4; + if (phylink_test(xpcs->supported, 10000baseKR_Full)) + adv |= DW_C73_10000KR; + + ret = xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV2, adv); + if (ret < 0) + return ret; + + /* SR_AN_ADV1 */ + adv = DW_C73_AN_ADV_SF; + if (phylink_test(xpcs->supported, Pause)) + adv |= DW_C73_PAUSE; + if (phylink_test(xpcs->supported, Asym_Pause)) + adv |= DW_C73_ASYM_PAUSE; + + return xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV1, adv); +} + +static int xpcs_config_aneg(struct mdio_xpcs_args *xpcs) +{ + int ret; + + ret = xpcs_config_aneg_c73(xpcs); + if (ret < 0) + return ret; + + ret = xpcs_read(xpcs, MDIO_MMD_AN, MDIO_CTRL1); + if (ret < 0) + return ret; + + ret |= MDIO_AN_CTRL1_ENABLE | MDIO_AN_CTRL1_RESTART; + + return xpcs_write(xpcs, MDIO_MMD_AN, MDIO_CTRL1, ret); +} + +static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + ret = xpcs_read(xpcs, MDIO_MMD_AN, MDIO_STAT1); + if (ret < 0) + return ret; + + if (ret & MDIO_AN_STAT1_COMPLETE) { + ret = xpcs_read(xpcs, MDIO_MMD_AN, DW_SR_AN_LP_ABL1); + if (ret < 0) + return ret; + + /* Check if Aneg outcome is valid */ + if (!(ret & DW_C73_AN_ADV_SF)) + return 0; + + return 1; + } + + return 0; +} + +static int xpcs_read_lpa(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + ret = xpcs_read(xpcs, MDIO_MMD_AN, MDIO_STAT1); + if (ret < 0) + return ret; + + if (!(ret & MDIO_AN_STAT1_LPABLE)) { + phylink_clear(state->lp_advertising, Autoneg); + return 0; + } + + phylink_set(state->lp_advertising, Autoneg); + + /* Clause 73 outcome */ + ret = xpcs_read(xpcs, MDIO_MMD_AN, DW_SR_AN_LP_ABL3); + if (ret < 0) + return ret; + + if (ret & DW_C73_2500KX) + phylink_set(state->lp_advertising, 2500baseX_Full); + + ret = xpcs_read(xpcs, MDIO_MMD_AN, DW_SR_AN_LP_ABL2); + if (ret < 0) + return ret; + + if (ret & DW_C73_1000KX) + phylink_set(state->lp_advertising, 
1000baseKX_Full); + if (ret & DW_C73_10000KX4) + phylink_set(state->lp_advertising, 10000baseKX4_Full); + if (ret & DW_C73_10000KR) + phylink_set(state->lp_advertising, 10000baseKR_Full); + + ret = xpcs_read(xpcs, MDIO_MMD_AN, DW_SR_AN_LP_ABL1); + if (ret < 0) + return ret; + + if (ret & DW_C73_PAUSE) + phylink_set(state->lp_advertising, Pause); + if (ret & DW_C73_ASYM_PAUSE) + phylink_set(state->lp_advertising, Asym_Pause); + + linkmode_and(state->lp_advertising, state->lp_advertising, + state->advertising); + return 0; +} + +static void xpcs_resolve_lpa(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int max_speed = xpcs_get_max_usxgmii_speed(state->lp_advertising); + + state->pause = MLO_PAUSE_TX | MLO_PAUSE_RX; + state->speed = max_speed; + state->duplex = DUPLEX_FULL; +} + +static void xpcs_resolve_pma(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + state->pause = MLO_PAUSE_TX | MLO_PAUSE_RX; + state->duplex = DUPLEX_FULL; + + switch (state->interface) { + case PHY_INTERFACE_MODE_10GKR: + state->speed = SPEED_10000; + break; + default: + state->speed = SPEED_UNKNOWN; + break; + } +} + +static int xpcs_validate(struct mdio_xpcs_args *xpcs, + unsigned long *supported, + struct phylink_link_state *state) +{ + linkmode_and(supported, supported, xpcs->supported); + linkmode_and(state->advertising, state->advertising, xpcs->supported); + return 0; +} + +static int xpcs_config(struct mdio_xpcs_args *xpcs, + const struct phylink_link_state *state) +{ + int ret; + + if (state->an_enabled) { + ret = xpcs_config_aneg(xpcs); + if (ret) + return ret; + } + + return 0; +} + +static int xpcs_get_state(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + /* Link needs to be read first ... */ + state->link = xpcs_read_link(xpcs, state->an_enabled) > 0 ? 1 : 0; + + /* ... and then we check the faults. 
*/ + ret = xpcs_read_fault(xpcs, state); + if (ret) { + ret = xpcs_soft_reset(xpcs, MDIO_MMD_PCS); + if (ret) + return ret; + + state->link = 0; + + return xpcs_config(xpcs, state); + } + + if (state->link && state->an_enabled && xpcs_aneg_done(xpcs, state)) { + state->an_complete = true; + xpcs_read_lpa(xpcs, state); + xpcs_resolve_lpa(xpcs, state); + } else if (state->link) { + xpcs_resolve_pma(xpcs, state); + } + + return 0; +} + +static int xpcs_link_up(struct mdio_xpcs_args *xpcs, int speed, + phy_interface_t interface) +{ + if (interface == PHY_INTERFACE_MODE_USXGMII) + return xpcs_config_usxgmii(xpcs, speed); + + return 0; +} + +static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs) +{ + int ret; + u32 id; + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MII_PHYSID1); + if (ret < 0) + return 0xffffffff; + + id = ret << 16; + + ret = xpcs_read(xpcs, MDIO_MMD_PCS, MII_PHYSID2); + if (ret < 0) + return 0xffffffff; + + return id | ret; +} + +static bool xpcs_check_features(struct mdio_xpcs_args *xpcs, + struct xpcs_id *match, + phy_interface_t interface) +{ + int i; + + for (i = 0; match->interface[i] != PHY_INTERFACE_MODE_MAX; i++) { + if (match->interface[i] == interface) + break; + } + + if (match->interface[i] == PHY_INTERFACE_MODE_MAX) + return false; + + for (i = 0; match->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++) + set_bit(match->supported[i], xpcs->supported); + + return true; +} + +static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface) +{ + u32 xpcs_id = xpcs_get_id(xpcs); + struct xpcs_id *match = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(xpcs_id_list); i++) { + struct xpcs_id *entry = &xpcs_id_list[i]; + + if ((xpcs_id & entry->mask) == entry->id) { + match = entry; + + if (xpcs_check_features(xpcs, match, interface)) + return 0; + } + } + + return -ENODEV; +} + +static struct mdio_xpcs_ops xpcs_ops = { + .validate = xpcs_validate, + .config = xpcs_config, + .get_state = xpcs_get_state, + .link_up = xpcs_link_up, + .probe = xpcs_probe, +}; + +struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) +{ + return &xpcs_ops; +} +EXPORT_SYMBOL_GPL(mdio_xpcs_get_ops); + +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mdio-xpcs.h b/include/linux/mdio-xpcs.h new file mode 100644 index 000000000000..9a841aa5982d --- /dev/null +++ b/include/linux/mdio-xpcs.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. 
+ * Synopsys DesignWare XPCS helpers + */ + +#ifndef __LINUX_MDIO_XPCS_H +#define __LINUX_MDIO_XPCS_H + +#include +#include + +struct mdio_xpcs_args { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + struct mii_bus *bus; + int addr; +}; + +struct mdio_xpcs_ops { + int (*validate)(struct mdio_xpcs_args *xpcs, + unsigned long *supported, + struct phylink_link_state *state); + int (*config)(struct mdio_xpcs_args *xpcs, + const struct phylink_link_state *state); + int (*get_state)(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state); + int (*link_up)(struct mdio_xpcs_args *xpcs, int speed, + phy_interface_t interface); + int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); +}; + +#if IS_ENABLED(CONFIG_MDIO_XPCS) +struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); +#else +static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) +{ + return NULL; +} +#endif + +#endif /* __LINUX_MDIO_XPCS_H */ -- cgit v1.2.3 From f213bbe8a9d6ba1d0adf424787c02f361ea78c38 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 9 Mar 2020 09:36:27 +0100 Subject: net: stmmac: Integrate it with DesignWare XPCS Adds all the necessary logic so that stmmac can be used with Synopsys DesignWare XPCS. Signed-off-by: Jose Abreu Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 1 + drivers/net/ethernet/stmicro/stmmac/common.h | 3 +++ drivers/net/ethernet/stmicro/stmmac/hwif.h | 12 ++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 ++++++++++++-- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 27 +++++++++++++++++++++++ include/linux/stmmac.h | 1 + 6 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 338e25a6374e..9ad927f646e8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -3,6 +3,7 @@ config STMMAC_ETH tristate "STMicroelectronics Multi-Gigabit Ethernet driver" depends on HAS_IOMEM && HAS_DMA select MII + select MDIO_XPCS select PAGE_POOL select PHYLINK select CRC32 diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 487099092693..9bdbf589d93f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_VLAN_8021Q) #define STMMAC_VLAN_TAG_USED @@ -446,6 +447,8 @@ struct mac_device_info { const struct stmmac_hwtimestamp *ptp; const struct stmmac_tc_ops *tc; const struct stmmac_mmc_ops *mmc; + const struct mdio_xpcs_ops *xpcs; + struct mdio_xpcs_args xpcs_args; struct mii_regs mii; /* MII register Addresses */ struct mac_link link; void __iomem *pcsr; /* vpointer to device CSRs */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index df63b0367aff..c71dd99c8abf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -577,6 +577,18 @@ struct stmmac_mmc_ops { #define stmmac_mmc_read(__priv, __args...) \ stmmac_do_void_callback(__priv, mmc, read, __args) +/* XPCS callbacks */ +#define stmmac_xpcs_validate(__priv, __args...) \ + stmmac_do_callback(__priv, xpcs, validate, __args) +#define stmmac_xpcs_config(__priv, __args...) \ + stmmac_do_callback(__priv, xpcs, config, __args) +#define stmmac_xpcs_get_state(__priv, __args...) 
\ + stmmac_do_callback(__priv, xpcs, get_state, __args) +#define stmmac_xpcs_link_up(__priv, __args...) \ + stmmac_do_callback(__priv, xpcs, link_up, __args) +#define stmmac_xpcs_probe(__priv, __args...) \ + stmmac_do_callback(__priv, xpcs, probe, __args) + struct stmmac_regs_off { u32 ptp_off; u32 mmc_off; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3a190cf250e6..f26699d9a050 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -863,18 +863,26 @@ static void stmmac_validate(struct phylink_config *config, linkmode_and(state->advertising, state->advertising, mac_supported); linkmode_andnot(state->advertising, state->advertising, mask); + + /* If PCS is supported, check which modes it supports. */ + stmmac_xpcs_validate(priv, &priv->hw->xpcs_args, supported, state); } static void stmmac_mac_pcs_get_state(struct phylink_config *config, struct phylink_link_state *state) { + struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); + state->link = 0; + stmmac_xpcs_get_state(priv, &priv->hw->xpcs_args, state); } static void stmmac_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - /* Nothing for now. */ + struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); + + stmmac_xpcs_config(priv, &priv->hw->xpcs_args, state); } static void stmmac_mac_an_restart(struct phylink_config *config) @@ -902,6 +910,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); u32 ctrl; + stmmac_xpcs_link_up(priv, &priv->hw->xpcs_args, speed, interface); + ctrl = readl(priv->ioaddr + MAC_CTRL_REG); ctrl &= ~priv->hw->link.speed_mask; @@ -1042,6 +1052,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.dev = &priv->dev->dev; priv->phylink_config.type = PHYLINK_NETDEV; + priv->phylink_config.pcs_poll = true; if (!fwnode) fwnode = dev_fwnode(priv->device); @@ -2689,7 +2700,8 @@ static int stmmac_open(struct net_device *dev) int ret; if (priv->hw->pcs != STMMAC_PCS_TBI && - priv->hw->pcs != STMMAC_PCS_RTBI) { + priv->hw->pcs != STMMAC_PCS_RTBI && + priv->hw->xpcs == NULL) { ret = stmmac_init_phy(dev); if (ret) { netdev_err(priv->dev, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index cfe5d8b73142..b2a707e2ef43 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -382,6 +382,14 @@ int stmmac_mdio_register(struct net_device *ndev) max_addr = PHY_MAX_ADDR; } + if (mdio_bus_data->has_xpcs) { + priv->hw->xpcs = mdio_xpcs_get_ops(); + if (!priv->hw->xpcs) { + err = -ENODEV; + goto bus_register_fail; + } + } + if (mdio_bus_data->needs_reset) new_bus->reset = &stmmac_mdio_reset; @@ -433,6 +441,25 @@ int stmmac_mdio_register(struct net_device *ndev) found = 1; } + /* Try to probe the XPCS by scanning all addresses. 
*/ + if (priv->hw->xpcs) { + struct mdio_xpcs_args *xpcs = &priv->hw->xpcs_args; + int ret, mode = priv->plat->phy_interface; + max_addr = PHY_MAX_ADDR; + + xpcs->bus = new_bus; + + for (addr = 0; addr < max_addr; addr++) { + xpcs->addr = addr; + + ret = stmmac_xpcs_probe(priv, xpcs, mode); + if (!ret) { + found = 1; + break; + } + } + } + if (!found && !mdio_node) { dev_warn(dev, "No PHY found\n"); mdiobus_unregister(new_bus); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 19190c609282..fbafb353e9be 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -80,6 +80,7 @@ struct stmmac_mdio_bus_data { unsigned int phy_mask; + unsigned int has_xpcs; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From 812df69beb86b0e6decbb109ee3fa408dcb7fa5d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Mar 2020 19:15:06 -0700 Subject: net: liquidio: reject unsupported coalescing params Set ethtool_ops->supported_coalesce_params to let the core reject unsupported coalescing parameters. This driver did not previously reject unsupported parameters. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 11 +++++++++++ include/linux/ethtool.h | 5 +++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c index 2b27e3aad9db..16eebfc52109 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c @@ -3097,7 +3097,17 @@ static int lio_set_fecparam(struct net_device *netdev, return 0; } +#define LIO_ETHTOOL_COALESCE (ETHTOOL_COALESCE_RX_USECS | \ + ETHTOOL_COALESCE_MAX_FRAMES | \ + ETHTOOL_COALESCE_USE_ADAPTIVE | \ + ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW | \ + ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW | \ + ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH | \ + ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH | \ + ETHTOOL_COALESCE_PKT_RATE_RX_USECS) + static const struct ethtool_ops lio_ethtool_ops = { + .supported_coalesce_params = LIO_ETHTOOL_COALESCE, .get_link_ksettings = lio_get_link_ksettings, .set_link_ksettings = lio_set_link_ksettings, .get_fecparam = lio_get_fecparam, @@ -3128,6 +3138,7 @@ static const struct ethtool_ops lio_ethtool_ops = { }; static const struct ethtool_ops lio_vf_ethtool_ops = { + .supported_coalesce_params = LIO_ETHTOOL_COALESCE, .get_link_ksettings = lio_get_link_ksettings, .get_link = ethtool_op_get_link, .get_drvinfo = lio_get_vf_drvinfo, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e464c946bca4..9efeebde3514 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,6 +211,11 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ) #define ETHTOOL_COALESCE_USE_ADAPTIVE \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX) +#define ETHTOOL_COALESCE_PKT_RATE_RX_USECS \ + (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | \ + ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ + ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ + ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) /** * struct ethtool_ops - optional netdev operations -- cgit v1.2.3 From babf3164095b0670435910340c2a1eec37757b57 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Mar 2020 16:10:51 -0700 Subject: bpf: Add bpf_link_new_file that doesn't install FD Add bpf_link_new_file() API for cases when we need to ensure anon_inode is 
successfully created before we proceed with the expensive BPF program attachment procedure, which would otherwise require an equally (if not more) expensive and potentially failing compensating detachment procedure just because anon_inode creation failed. This API allows code to be simplified by first ensuring that the anon_inode is created, and only after the BPF program is attached proceeding with fd_install(), which can't fail. After the anon_inode file is created, the link can't simply be kfree()'d anymore, because its destruction will be performed by a deferred file_operations->release call. For this, the bpf_link API now requires specifying two separate operations: release() and dealloc(); the former performs detachment only, while the latter frees the memory used by bpf_link itself. dealloc() needs to be specified because struct bpf_link is frequently embedded into a link type-specific container struct (e.g., struct bpf_raw_tp_link), so bpf_link itself doesn't know how to properly free the memory. In the case where the anon_inode file was successfully created but subsequent BPF attachment failed, the bpf_link needs to be marked as "defunct", so that the file's release() callback performs only memory deallocation, but no detachment. Convert raw tracepoint and tracing attachment to the new API and eliminate detachment from the error handling path. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200309231051.1270337-1-andriin@fb.com --- include/linux/bpf.h | 3 ++ kernel/bpf/syscall.c | 122 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 91 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 94a329b9da81..4fd91b7c95ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1070,13 +1070,16 @@ struct bpf_link; struct bpf_link_ops { void (*release)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_defunct(struct bpf_link *link); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7ce0815793dd..b2f73ecacced 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2188,6 +2188,11 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, link->prog = prog; } +void bpf_link_defunct(struct bpf_link *link) +{ + link->prog = NULL; +} + void bpf_link_inc(struct bpf_link *link) { atomic64_inc(&link->refcnt); @@ -2196,14 +2201,13 @@ void bpf_link_inc(struct bpf_link *link) /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { - struct bpf_prog *prog; - - /* remember prog locally, because release below will free link memory */ - prog = link->prog; - /* extra clean up and kfree of container link struct */ - link->ops->release(link); - /* no more accesing of link members after this point */ - bpf_prog_put(prog); + if (link->prog) { + /* detach BPF program, clean up used resources */ + link->ops->release(link); + bpf_prog_put(link->prog); + } + /* free bpf_link and its containing memory */ + link->ops->dealloc(link); } static void bpf_link_put_deferred(struct work_struct *work) @@
-2281,6 +2285,33 @@ int bpf_link_new_fd(struct bpf_link *link) return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); } +/* Similar to bpf_link_new_fd, create anon_inode for given bpf_link, but + * instead of immediately installing fd in fdtable, just reserve it and + * return. Caller then need to either install it with fd_install(fd, file) or + * release with put_unused_fd(fd). + * This is useful for cases when bpf_link attachment/detachment are + * complicated and expensive operations and should be delayed until all the fd + * reservation and anon_inode creation succeeds. + */ +struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd) +{ + struct file *file; + int fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return ERR_PTR(fd); + + file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(fd); + return file; + } + + *reserved_fd = fd; + return file; +} + struct bpf_link *bpf_link_get_from_fd(u32 ufd) { struct fd f = fdget(ufd); @@ -2305,21 +2336,27 @@ struct bpf_tracing_link { }; static void bpf_tracing_link_release(struct bpf_link *link) +{ + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); +} + +static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); kfree(tr_link); } static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, + .dealloc = bpf_tracing_link_dealloc, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog) { struct bpf_tracing_link *link; + struct file *link_file; int link_fd, err; if (prog->expected_attach_type != BPF_TRACE_FENTRY && @@ -2337,20 +2374,24 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) } bpf_link_init(&link->link, &bpf_tracing_link_lops, prog); - err = bpf_trampoline_link_prog(prog); - if (err) - goto out_free_link; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_prog; + } - link_fd = bpf_link_new_fd(&link->link); - if (link_fd < 0) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(prog)); - err = link_fd; - goto out_free_link; + err = bpf_trampoline_link_prog(prog); + if (err) { + bpf_link_defunct(&link->link); + fput(link_file); + put_unused_fd(link_fd); + goto out_put_prog; } + + fd_install(link_fd, link_file); return link_fd; -out_free_link: - kfree(link); out_put_prog: bpf_prog_put(prog); return err; @@ -2368,19 +2409,28 @@ static void bpf_raw_tp_link_release(struct bpf_link *link) bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog); bpf_put_raw_tracepoint(raw_tp->btp); +} + +static void bpf_raw_tp_link_dealloc(struct bpf_link *link) +{ + struct bpf_raw_tp_link *raw_tp = + container_of(link, struct bpf_raw_tp_link, link); + kfree(raw_tp); } static const struct bpf_link_ops bpf_raw_tp_lops = { .release = bpf_raw_tp_link_release, + .dealloc = bpf_raw_tp_link_dealloc, }; #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { - struct bpf_raw_tp_link *raw_tp; + struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; + struct file *link_file; struct bpf_prog *prog; const char *tp_name; char buf[128]; @@ -2431,28 +2481,32 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) goto out_put_prog; } - raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); - if (!raw_tp) { + link = 
kzalloc(sizeof(*link), GFP_USER); + if (!link) { err = -ENOMEM; goto out_put_btp; } - bpf_link_init(&raw_tp->link, &bpf_raw_tp_lops, prog); - raw_tp->btp = btp; + bpf_link_init(&link->link, &bpf_raw_tp_lops, prog); + link->btp = btp; - err = bpf_probe_register(raw_tp->btp, prog); - if (err) - goto out_free_tp; + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_btp; + } - link_fd = bpf_link_new_fd(&raw_tp->link); - if (link_fd < 0) { - bpf_probe_unregister(raw_tp->btp, prog); - err = link_fd; - goto out_free_tp; + err = bpf_probe_register(link->btp, prog); + if (err) { + bpf_link_defunct(&link->link); + fput(link_file); + put_unused_fd(link_fd); + goto out_put_btp; } + + fd_install(link_fd, link_file); return link_fd; -out_free_tp: - kfree(raw_tp); out_put_btp: bpf_put_raw_tracepoint(btp); out_put_prog: -- cgit v1.2.3 From 0fa81b304a7973a499f844176ca031109487dd31 Mon Sep 17 00:00:00 2001 From: Alexander Bersenev Date: Fri, 6 Mar 2020 01:33:16 +0500 Subject: cdc_ncm: Implement the 32-bit version of NCM Transfer Block The NCM specification defines two formats of transfer blocks: with 16-bit fields (NTB-16) and with 32-bit fields (NTB-32). Currently only NTB-16 is implemented. This patch adds support for NTB-32. The motivation behind this is that some devices, such as the E5785 or E5885 from the current generation of Huawei LTE routers, do not support NTB-16. Previous generations of Huawei devices also use NTB-32 by default. This patch also enables NTB-32 by default for Huawei devices. During 2019, ValdikSS made five attempts to contact Huawei to add NTB-16 support to their router firmware, but they were unsuccessful. Signed-off-by: Alexander Bersenev Signed-off-by: David S.
Miller --- drivers/net/usb/cdc_ncm.c | 411 ++++++++++++++++++++++++++++++--------- drivers/net/usb/huawei_cdc_ncm.c | 8 +- include/linux/usb/cdc_ncm.h | 15 +- 3 files changed, 340 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index c2c82e6391b4..5569077bd5b8 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -175,7 +175,11 @@ static u32 cdc_ncm_check_tx_max(struct usbnet *dev, u32 new_tx) u32 val, max, min; /* clamp new_tx to sane values */ - min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth16); + if (ctx->is_ndp16) + min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth16); + else + min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth32); + max = min_t(u32, CDC_NCM_NTB_MAX_SIZE_TX, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize)); /* some devices set dwNtbOutMaxSize too low for the above default */ @@ -307,10 +311,17 @@ static ssize_t ndp_to_end_store(struct device *d, struct device_attribute *attr if (enable == (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) return len; - if (enable && !ctx->delayed_ndp16) { - ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); - if (!ctx->delayed_ndp16) - return -ENOMEM; + if (enable) { + if (ctx->is_ndp16 && !ctx->delayed_ndp16) { + ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); + if (!ctx->delayed_ndp16) + return -ENOMEM; + } + if (!ctx->is_ndp16 && !ctx->delayed_ndp32) { + ctx->delayed_ndp32 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); + if (!ctx->delayed_ndp32) + return -ENOMEM; + } } /* flush pending data before changing flag */ @@ -512,6 +523,9 @@ static int cdc_ncm_init(struct usbnet *dev) dev_err(&dev->intf->dev, "SET_CRC_MODE failed\n"); } + /* use ndp16 by default */ + ctx->is_ndp16 = 1; + /* set NTB format, if both formats are supported. 
* * "The host shall only send this command while the NCM Data @@ -519,14 +533,27 @@ static int cdc_ncm_init(struct usbnet *dev) */ if (le16_to_cpu(ctx->ncm_parm.bmNtbFormatsSupported) & USB_CDC_NCM_NTB32_SUPPORTED) { - dev_dbg(&dev->intf->dev, "Setting NTB format to 16-bit\n"); - err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, - USB_TYPE_CLASS | USB_DIR_OUT - | USB_RECIP_INTERFACE, - USB_CDC_NCM_NTB16_FORMAT, - iface_no, NULL, 0); - if (err < 0) + if (ctx->drvflags & CDC_NCM_FLAG_PREFER_NTB32) { + ctx->is_ndp16 = 0; + dev_dbg(&dev->intf->dev, "Setting NTB format to 32-bit\n"); + err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, + USB_TYPE_CLASS | USB_DIR_OUT + | USB_RECIP_INTERFACE, + USB_CDC_NCM_NTB32_FORMAT, + iface_no, NULL, 0); + } else { + ctx->is_ndp16 = 1; + dev_dbg(&dev->intf->dev, "Setting NTB format to 16-bit\n"); + err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, + USB_TYPE_CLASS | USB_DIR_OUT + | USB_RECIP_INTERFACE, + USB_CDC_NCM_NTB16_FORMAT, + iface_no, NULL, 0); + } + if (err < 0) { + ctx->is_ndp16 = 1; dev_err(&dev->intf->dev, "SET_NTB_FORMAT failed\n"); + } } /* set initial device values */ @@ -549,7 +576,10 @@ static int cdc_ncm_init(struct usbnet *dev) ctx->tx_max_datagrams = CDC_NCM_DPT_DATAGRAMS_MAX; /* set up maximum NDP size */ - ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp16) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe16); + if (ctx->is_ndp16) + ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp16) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe16); + else + ctx->max_ndp_size = sizeof(struct usb_cdc_ncm_ndp32) + (ctx->tx_max_datagrams + 1) * sizeof(struct usb_cdc_ncm_dpe32); /* initial coalescing timer interval */ ctx->timer_interval = CDC_NCM_TIMER_INTERVAL_USEC * NSEC_PER_USEC; @@ -734,7 +764,10 @@ static void cdc_ncm_free(struct cdc_ncm_ctx *ctx) ctx->tx_curr_skb = NULL; } - kfree(ctx->delayed_ndp16); + if (ctx->is_ndp16) + kfree(ctx->delayed_ndp16); + else + kfree(ctx->delayed_ndp32); kfree(ctx); } @@ -772,10 +805,8 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ u8 *buf; int len; int temp; - int err; u8 iface_no; struct usb_cdc_parsed_header hdr; - __le16 curr_ntb_format; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -879,32 +910,6 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ goto error2; } - /* - * Some Huawei devices have been observed to come out of reset in NDP32 mode. - * Let's check if this is the case, and set the device to NDP16 mode again if - * needed. - */ - if (ctx->drvflags & CDC_NCM_FLAG_RESET_NTB16) { - err = usbnet_read_cmd(dev, USB_CDC_GET_NTB_FORMAT, - USB_TYPE_CLASS | USB_DIR_IN | USB_RECIP_INTERFACE, - 0, iface_no, &curr_ntb_format, 2); - if (err < 0) { - goto error2; - } - - if (curr_ntb_format == cpu_to_le16(USB_CDC_NCM_NTB32_FORMAT)) { - dev_info(&intf->dev, "resetting NTB format to 16-bit"); - err = usbnet_write_cmd(dev, USB_CDC_SET_NTB_FORMAT, - USB_TYPE_CLASS | USB_DIR_OUT - | USB_RECIP_INTERFACE, - USB_CDC_NCM_NTB16_FORMAT, - iface_no, NULL, 0); - - if (err < 0) - goto error2; - } - } - cdc_ncm_find_endpoints(dev, ctx->data); cdc_ncm_find_endpoints(dev, ctx->control); if (!dev->in || !dev->out || !dev->status) { @@ -929,9 +934,15 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ /* Allocate the delayed NDP if needed. 
*/ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { - ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); - if (!ctx->delayed_ndp16) - goto error2; + if (ctx->is_ndp16) { + ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); + if (!ctx->delayed_ndp16) + goto error2; + } else { + ctx->delayed_ndp32 = kzalloc(ctx->max_ndp_size, GFP_KERNEL); + if (!ctx->delayed_ndp32) + goto error2; + } dev_info(&intf->dev, "NDP will be placed at end of frame for this device."); } @@ -1055,7 +1066,7 @@ static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remai /* return a pointer to a valid struct usb_cdc_ncm_ndp16 of type sign, possibly * allocating a new one within skb */ -static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign, size_t reserve) +static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign, size_t reserve) { struct usb_cdc_ncm_ndp16 *ndp16 = NULL; struct usb_cdc_ncm_nth16 *nth16 = (void *)skb->data; @@ -1110,12 +1121,73 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_ return ndp16; } +static struct usb_cdc_ncm_ndp32 *cdc_ncm_ndp32(struct cdc_ncm_ctx *ctx, struct sk_buff *skb, __le32 sign, size_t reserve) +{ + struct usb_cdc_ncm_ndp32 *ndp32 = NULL; + struct usb_cdc_ncm_nth32 *nth32 = (void *)skb->data; + size_t ndpoffset = le32_to_cpu(nth32->dwNdpIndex); + + /* If NDP should be moved to the end of the NCM package, we can't follow the + * NTH32 header as we would normally do. NDP isn't written to the SKB yet, and + * the wNdpIndex field in the header is actually not consistent with reality. It will be later. + */ + if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { + if (ctx->delayed_ndp32->dwSignature == sign) + return ctx->delayed_ndp32; + + /* We can only push a single NDP to the end. Return + * NULL to send what we've already got and queue this + * skb for later. 
+ */ + else if (ctx->delayed_ndp32->dwSignature) + return NULL; + } + + /* follow the chain of NDPs, looking for a match */ + while (ndpoffset) { + ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb->data + ndpoffset); + if (ndp32->dwSignature == sign) + return ndp32; + ndpoffset = le32_to_cpu(ndp32->dwNextNdpIndex); + } + + /* align new NDP */ + if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) + cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size); + + /* verify that there is room for the NDP and the datagram (reserve) */ + if ((ctx->tx_curr_size - skb->len - reserve) < ctx->max_ndp_size) + return NULL; + + /* link to it */ + if (ndp32) + ndp32->dwNextNdpIndex = cpu_to_le32(skb->len); + else + nth32->dwNdpIndex = cpu_to_le32(skb->len); + + /* push a new empty NDP */ + if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)) + ndp32 = skb_put_zero(skb, ctx->max_ndp_size); + else + ndp32 = ctx->delayed_ndp32; + + ndp32->dwSignature = sign; + ndp32->wLength = cpu_to_le32(sizeof(struct usb_cdc_ncm_ndp32) + sizeof(struct usb_cdc_ncm_dpe32)); + return ndp32; +} + struct sk_buff * cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) { struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; - struct usb_cdc_ncm_nth16 *nth16; - struct usb_cdc_ncm_ndp16 *ndp16; + union { + struct usb_cdc_ncm_nth16 *nth16; + struct usb_cdc_ncm_nth32 *nth32; + } nth; + union { + struct usb_cdc_ncm_ndp16 *ndp16; + struct usb_cdc_ncm_ndp32 *ndp32; + } ndp; struct sk_buff *skb_out; u16 n = 0, index, ndplen; u8 ready2send = 0; @@ -1179,11 +1251,19 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) } ctx->tx_low_mem_val--; } - /* fill out the initial 16-bit NTB header */ - nth16 = skb_put_zero(skb_out, sizeof(struct usb_cdc_ncm_nth16)); - nth16->dwSignature = cpu_to_le32(USB_CDC_NCM_NTH16_SIGN); - nth16->wHeaderLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16)); - nth16->wSequence = cpu_to_le16(ctx->tx_seq++); + if (ctx->is_ndp16) { + /* fill out the initial 16-bit NTB header */ + nth.nth16 = skb_put_zero(skb_out, sizeof(struct usb_cdc_ncm_nth16)); + nth.nth16->dwSignature = cpu_to_le32(USB_CDC_NCM_NTH16_SIGN); + nth.nth16->wHeaderLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_nth16)); + nth.nth16->wSequence = cpu_to_le16(ctx->tx_seq++); + } else { + /* fill out the initial 32-bit NTB header */ + nth.nth32 = skb_put_zero(skb_out, sizeof(struct usb_cdc_ncm_nth32)); + nth.nth32->dwSignature = cpu_to_le32(USB_CDC_NCM_NTH32_SIGN); + nth.nth32->wHeaderLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_nth32)); + nth.nth32->wSequence = cpu_to_le16(ctx->tx_seq++); + } /* count total number of frames in this NTB */ ctx->tx_curr_frame_num = 0; @@ -1205,13 +1285,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) } /* get the appropriate NDP for this skb */ - ndp16 = cdc_ncm_ndp(ctx, skb_out, sign, skb->len + ctx->tx_modulus + ctx->tx_remainder); + if (ctx->is_ndp16) + ndp.ndp16 = cdc_ncm_ndp16(ctx, skb_out, sign, skb->len + ctx->tx_modulus + ctx->tx_remainder); + else + ndp.ndp32 = cdc_ncm_ndp32(ctx, skb_out, sign, skb->len + ctx->tx_modulus + ctx->tx_remainder); /* align beginning of next frame */ cdc_ncm_align_tail(skb_out, ctx->tx_modulus, ctx->tx_remainder, ctx->tx_curr_size); /* check if we had enough room left for both NDP and frame */ - if (!ndp16 || skb_out->len + skb->len + delayed_ndp_size > ctx->tx_curr_size) { + if ((ctx->is_ndp16 && !ndp.ndp16) || (!ctx->is_ndp16 && !ndp.ndp32) || + skb_out->len + skb->len + delayed_ndp_size > 
ctx->tx_curr_size) { if (n == 0) { /* won't fit, MTU problem? */ dev_kfree_skb_any(skb); @@ -1233,13 +1317,22 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) } /* calculate frame number withing this NDP */ - ndplen = le16_to_cpu(ndp16->wLength); - index = (ndplen - sizeof(struct usb_cdc_ncm_ndp16)) / sizeof(struct usb_cdc_ncm_dpe16) - 1; + if (ctx->is_ndp16) { + ndplen = le16_to_cpu(ndp.ndp16->wLength); + index = (ndplen - sizeof(struct usb_cdc_ncm_ndp16)) / sizeof(struct usb_cdc_ncm_dpe16) - 1; + + /* OK, add this skb */ + ndp.ndp16->dpe16[index].wDatagramLength = cpu_to_le16(skb->len); + ndp.ndp16->dpe16[index].wDatagramIndex = cpu_to_le16(skb_out->len); + ndp.ndp16->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe16)); + } else { + ndplen = le16_to_cpu(ndp.ndp32->wLength); + index = (ndplen - sizeof(struct usb_cdc_ncm_ndp32)) / sizeof(struct usb_cdc_ncm_dpe32) - 1; - /* OK, add this skb */ - ndp16->dpe16[index].wDatagramLength = cpu_to_le16(skb->len); - ndp16->dpe16[index].wDatagramIndex = cpu_to_le16(skb_out->len); - ndp16->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe16)); + ndp.ndp32->dpe32[index].dwDatagramLength = cpu_to_le32(skb->len); + ndp.ndp32->dpe32[index].dwDatagramIndex = cpu_to_le32(skb_out->len); + ndp.ndp32->wLength = cpu_to_le16(ndplen + sizeof(struct usb_cdc_ncm_dpe32)); + } skb_put_data(skb_out, skb->data, skb->len); ctx->tx_curr_frame_payload += skb->len; /* count real tx payload data */ dev_kfree_skb_any(skb); @@ -1286,13 +1379,22 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* If requested, put NDP at end of frame. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) { - nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; - cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); - nth16->wNdpIndex = cpu_to_le16(skb_out->len); - skb_put_data(skb_out, ctx->delayed_ndp16, ctx->max_ndp_size); + if (ctx->is_ndp16) { + nth.nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; + cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); + nth.nth16->wNdpIndex = cpu_to_le16(skb_out->len); + skb_put_data(skb_out, ctx->delayed_ndp16, ctx->max_ndp_size); + + /* Zero out delayed NDP - signature checking will naturally fail. */ + ndp.ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size); + } else { + nth.nth32 = (struct usb_cdc_ncm_nth32 *)skb_out->data; + cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_curr_size - ctx->max_ndp_size); + nth.nth32->dwNdpIndex = cpu_to_le32(skb_out->len); + skb_put_data(skb_out, ctx->delayed_ndp32, ctx->max_ndp_size); - /* Zero out delayed NDP - signature checking will naturally fail. 
*/ - ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size); + ndp.ndp32 = memset(ctx->delayed_ndp32, 0, ctx->max_ndp_size); + } } /* If collected data size is less or equal ctx->min_tx_pkt @@ -1314,8 +1416,13 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) } /* set final frame length */ - nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; - nth16->wBlockLength = cpu_to_le16(skb_out->len); + if (ctx->is_ndp16) { + nth.nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data; + nth.nth16->wBlockLength = cpu_to_le16(skb_out->len); + } else { + nth.nth32 = (struct usb_cdc_ncm_nth32 *)skb_out->data; + nth.nth32->dwBlockLength = cpu_to_le32(skb_out->len); + } /* return skb */ ctx->tx_curr_skb = NULL; @@ -1398,7 +1505,12 @@ cdc_ncm_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) goto error; spin_lock_bh(&ctx->mtx); - skb_out = cdc_ncm_fill_tx_frame(dev, skb, cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)); + + if (ctx->is_ndp16) + skb_out = cdc_ncm_fill_tx_frame(dev, skb, cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)); + else + skb_out = cdc_ncm_fill_tx_frame(dev, skb, cpu_to_le32(USB_CDC_NCM_NDP32_NOCRC_SIGN)); + spin_unlock_bh(&ctx->mtx); return skb_out; @@ -1459,6 +1571,54 @@ error: } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_nth16); +int cdc_ncm_rx_verify_nth32(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in) +{ + struct usbnet *dev = netdev_priv(skb_in->dev); + struct usb_cdc_ncm_nth32 *nth32; + int len; + int ret = -EINVAL; + + if (ctx == NULL) + goto error; + + if (skb_in->len < (sizeof(struct usb_cdc_ncm_nth32) + + sizeof(struct usb_cdc_ncm_ndp32))) { + netif_dbg(dev, rx_err, dev->net, "frame too short\n"); + goto error; + } + + nth32 = (struct usb_cdc_ncm_nth32 *)skb_in->data; + + if (nth32->dwSignature != cpu_to_le32(USB_CDC_NCM_NTH32_SIGN)) { + netif_dbg(dev, rx_err, dev->net, + "invalid NTH32 signature <%#010x>\n", + le32_to_cpu(nth32->dwSignature)); + goto error; + } + + len = le32_to_cpu(nth32->dwBlockLength); + if (len > ctx->rx_max) { + netif_dbg(dev, rx_err, dev->net, + "unsupported NTB block length %u/%u\n", len, + ctx->rx_max); + goto error; + } + + if ((ctx->rx_seq + 1) != le16_to_cpu(nth32->wSequence) && + (ctx->rx_seq || le16_to_cpu(nth32->wSequence)) && + !((ctx->rx_seq == 0xffff) && !le16_to_cpu(nth32->wSequence))) { + netif_dbg(dev, rx_err, dev->net, + "sequence number glitch prev=%d curr=%d\n", + ctx->rx_seq, le16_to_cpu(nth32->wSequence)); + } + ctx->rx_seq = le16_to_cpu(nth32->wSequence); + + ret = le32_to_cpu(nth32->dwNdpIndex); +error: + return ret; +} +EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_nth32); + /* verify NDP header and return number of datagrams, or negative error */ int cdc_ncm_rx_verify_ndp16(struct sk_buff *skb_in, int ndpoffset) { @@ -1495,6 +1655,42 @@ error: } EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_ndp16); +/* verify NDP header and return number of datagrams, or negative error */ +int cdc_ncm_rx_verify_ndp32(struct sk_buff *skb_in, int ndpoffset) +{ + struct usbnet *dev = netdev_priv(skb_in->dev); + struct usb_cdc_ncm_ndp32 *ndp32; + int ret = -EINVAL; + + if ((ndpoffset + sizeof(struct usb_cdc_ncm_ndp32)) > skb_in->len) { + netif_dbg(dev, rx_err, dev->net, "invalid NDP offset <%u>\n", + ndpoffset); + goto error; + } + ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb_in->data + ndpoffset); + + if (le16_to_cpu(ndp32->wLength) < USB_CDC_NCM_NDP32_LENGTH_MIN) { + netif_dbg(dev, rx_err, dev->net, "invalid DPT32 length <%u>\n", + le16_to_cpu(ndp32->wLength)); + goto error; + } + + ret = ((le16_to_cpu(ndp32->wLength) - + sizeof(struct 
usb_cdc_ncm_ndp32)) / + sizeof(struct usb_cdc_ncm_dpe32)); + ret--; /* we process NDP entries except for the last one */ + + if ((sizeof(struct usb_cdc_ncm_ndp32) + + ret * (sizeof(struct usb_cdc_ncm_dpe32))) > skb_in->len) { + netif_dbg(dev, rx_err, dev->net, "Invalid nframes = %d\n", ret); + ret = -EINVAL; + } + +error: + return ret; +} +EXPORT_SYMBOL_GPL(cdc_ncm_rx_verify_ndp32); + int cdc_ncm_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) { struct sk_buff *skb; @@ -1503,34 +1699,66 @@ int cdc_ncm_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) int nframes; int x; int offset; - struct usb_cdc_ncm_ndp16 *ndp16; - struct usb_cdc_ncm_dpe16 *dpe16; + union { + struct usb_cdc_ncm_ndp16 *ndp16; + struct usb_cdc_ncm_ndp32 *ndp32; + } ndp; + union { + struct usb_cdc_ncm_dpe16 *dpe16; + struct usb_cdc_ncm_dpe32 *dpe32; + } dpe; + int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ u32 payload = 0; - ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); + if (ctx->is_ndp16) + ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); + else + ndpoffset = cdc_ncm_rx_verify_nth32(ctx, skb_in); + if (ndpoffset < 0) goto error; next_ndp: - nframes = cdc_ncm_rx_verify_ndp16(skb_in, ndpoffset); - if (nframes < 0) - goto error; + if (ctx->is_ndp16) { + nframes = cdc_ncm_rx_verify_ndp16(skb_in, ndpoffset); + if (nframes < 0) + goto error; - ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); + ndp.ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); - if (ndp16->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) { - netif_dbg(dev, rx_err, dev->net, - "invalid DPT16 signature <%#010x>\n", - le32_to_cpu(ndp16->dwSignature)); - goto err_ndp; + if (ndp.ndp16->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP16_NOCRC_SIGN)) { + netif_dbg(dev, rx_err, dev->net, + "invalid DPT16 signature <%#010x>\n", + le32_to_cpu(ndp.ndp16->dwSignature)); + goto err_ndp; + } + dpe.dpe16 = ndp.ndp16->dpe16; + } else { + nframes = cdc_ncm_rx_verify_ndp32(skb_in, ndpoffset); + if (nframes < 0) + goto error; + + ndp.ndp32 = (struct usb_cdc_ncm_ndp32 *)(skb_in->data + ndpoffset); + + if (ndp.ndp32->dwSignature != cpu_to_le32(USB_CDC_NCM_NDP32_NOCRC_SIGN)) { + netif_dbg(dev, rx_err, dev->net, + "invalid DPT32 signature <%#010x>\n", + le32_to_cpu(ndp.ndp32->dwSignature)); + goto err_ndp; + } + dpe.dpe32 = ndp.ndp32->dpe32; } - dpe16 = ndp16->dpe16; - for (x = 0; x < nframes; x++, dpe16++) { - offset = le16_to_cpu(dpe16->wDatagramIndex); - len = le16_to_cpu(dpe16->wDatagramLength); + for (x = 0; x < nframes; x++) { + if (ctx->is_ndp16) { + offset = le16_to_cpu(dpe.dpe16->wDatagramIndex); + len = le16_to_cpu(dpe.dpe16->wDatagramLength); + } else { + offset = le32_to_cpu(dpe.dpe32->dwDatagramIndex); + len = le32_to_cpu(dpe.dpe32->dwDatagramLength); + } /* * CDC NCM ch. 3.7 @@ -1561,10 +1789,19 @@ next_ndp: usbnet_skb_return(dev, skb); payload += len; /* count payload bytes in this NTB */ } + + if (ctx->is_ndp16) + dpe.dpe16++; + else + dpe.dpe32++; } err_ndp: /* are there more NDPs to process? 
*/ - ndpoffset = le16_to_cpu(ndp16->wNextNdpIndex); + if (ctx->is_ndp16) + ndpoffset = le16_to_cpu(ndp.ndp16->wNextNdpIndex); + else + ndpoffset = le32_to_cpu(ndp.ndp32->dwNextNdpIndex); + if (ndpoffset && loopcount--) goto next_ndp; diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c index e15a472c6a54..099d84827004 100644 --- a/drivers/net/usb/huawei_cdc_ncm.c +++ b/drivers/net/usb/huawei_cdc_ncm.c @@ -77,11 +77,11 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev, */ drvflags |= CDC_NCM_FLAG_NDP_TO_END; - /* Additionally, it has been reported that some Huawei E3372H devices, with - * firmware version 21.318.01.00.541, come out of reset in NTB32 format mode, hence - * needing to be set to the NTB16 one again. + /* For many Huawei devices the NTB32 mode is the default and the best mode + * they work with. Huawei E5785 and E5885 devices refuse to work in NTB16 mode at all. */ - drvflags |= CDC_NCM_FLAG_RESET_NTB16; + drvflags |= CDC_NCM_FLAG_PREFER_NTB32; + ret = cdc_ncm_bind_common(usbnet_dev, intf, 1, drvflags); if (ret) goto err; diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 1646c06989df..0ce4377545f8 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -46,9 +46,12 @@ #define CDC_NCM_DATA_ALTSETTING_NCM 1 #define CDC_NCM_DATA_ALTSETTING_MBIM 2 -/* CDC NCM subclass 3.2.1 */ +/* CDC NCM subclass 3.3.1 */ #define USB_CDC_NCM_NDP16_LENGTH_MIN 0x10 +/* CDC NCM subclass 3.3.2 */ +#define USB_CDC_NCM_NDP32_LENGTH_MIN 0x20 + /* Maximum NTB length */ #define CDC_NCM_NTB_MAX_SIZE_TX 32768 /* bytes */ #define CDC_NCM_NTB_MAX_SIZE_RX 32768 /* bytes */ @@ -84,7 +87,7 @@ /* Driver flags */ #define CDC_NCM_FLAG_NDP_TO_END 0x02 /* NDP is placed at end of frame */ #define CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE 0x04 /* Avoid altsetting toggle during init */ -#define CDC_NCM_FLAG_RESET_NTB16 0x08 /* set NDP16 one more time after altsetting switch */ +#define CDC_NCM_FLAG_PREFER_NTB32 0x08 /* prefer NDP32 over NDP16 */ #define cdc_ncm_comm_intf_is_mbim(x) ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \ (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE) @@ -113,7 +116,11 @@ struct cdc_ncm_ctx { u32 timer_interval; u32 max_ndp_size; - struct usb_cdc_ncm_ndp16 *delayed_ndp16; + u8 is_ndp16; + union { + struct usb_cdc_ncm_ndp16 *delayed_ndp16; + struct usb_cdc_ncm_ndp32 *delayed_ndp32; + }; u32 tx_timer_pending; u32 tx_curr_frame_num; @@ -150,6 +157,8 @@ void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign); int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in); int cdc_ncm_rx_verify_ndp16(struct sk_buff *skb_in, int ndpoffset); +int cdc_ncm_rx_verify_nth32(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in); +int cdc_ncm_rx_verify_ndp32(struct sk_buff *skb_in, int ndpoffset); struct sk_buff * cdc_ncm_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags); int cdc_ncm_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in); -- cgit v1.2.3 From 8213f6c9a275da084dc9363f36f93138547f46f1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Mar 2020 15:32:48 -0700 Subject: net: be2net: reject unsupported coalescing params Set ethtool_ops->supported_coalesce_params to let the core reject unsupported coalescing parameters. This driver did not previously reject unsupported parameters. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_ethtool.c | 3 +++ include/linux/ethtool.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 9d9f0545fbfe..d6ed1d943762 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1408,6 +1408,9 @@ static int be_set_priv_flags(struct net_device *netdev, u32 flags) } const struct ethtool_ops be_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_USECS_LOW_HIGH, .get_drvinfo = be_get_drvinfo, .get_wol = be_get_wol, .set_wol = be_set_wol, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9efeebde3514..acfce915a02b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,6 +211,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ) #define ETHTOOL_COALESCE_USE_ADAPTIVE \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | ETHTOOL_COALESCE_USE_ADAPTIVE_TX) +#define ETHTOOL_COALESCE_USECS_LOW_HIGH \ + (ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_TX_USECS_LOW | \ + ETHTOOL_COALESCE_RX_USECS_HIGH | ETHTOOL_COALESCE_TX_USECS_HIGH) #define ETHTOOL_COALESCE_PKT_RATE_RX_USECS \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | \ ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ -- cgit v1.2.3 From 4f9546d24a12e9041eae20574ab244f5410e02c4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 11 Mar 2020 15:32:52 -0700 Subject: net: hns: reject unsupported coalescing params Set ethtool_ops->supported_coalesce_params to let the core reject unsupported coalescing parameters. This driver did not previously reject unsupported parameters. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 5 +++++ include/linux/ethtool.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index 717fccc2efba..49624acf2473 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -1264,6 +1264,11 @@ static int hns_get_rxnfc(struct net_device *netdev, } static const struct ethtool_ops hns_ethtool_ops = { + .supported_coalesce_params = ETHTOOL_COALESCE_USECS | + ETHTOOL_COALESCE_MAX_FRAMES | + ETHTOOL_COALESCE_USE_ADAPTIVE | + ETHTOOL_COALESCE_USECS_LOW_HIGH | + ETHTOOL_COALESCE_MAX_FRAMES_LOW_HIGH, .get_drvinfo = hns_nic_get_drvinfo, .get_link = hns_nic_get_link, .get_ringparam = hns_get_ringparam, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index acfce915a02b..be355f37337d 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -214,6 +214,11 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_USECS_LOW_HIGH \ (ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_TX_USECS_LOW | \ ETHTOOL_COALESCE_RX_USECS_HIGH | ETHTOOL_COALESCE_TX_USECS_HIGH) +#define ETHTOOL_COALESCE_MAX_FRAMES_LOW_HIGH \ + (ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW | \ + ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW | \ + ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH | \ + ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH) #define ETHTOOL_COALESCE_PKT_RATE_RX_USECS \ (ETHTOOL_COALESCE_USE_ADAPTIVE_RX | \ ETHTOOL_COALESCE_RX_USECS_LOW | ETHTOOL_COALESCE_RX_USECS_HIGH | \ ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) -- cgit v1.2.3 From 5b7cb7451585f83d414512a70b79b2086b8c6ed1 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 12 Mar 2020 12:23:03 +0200 Subject: net/mlx5: E-Switch, Enable reg c1 loopback when possible Enable reg c1 loopback if the firmware reports it's supported, as this is needed for restoring packet metadata (e.g. chain). Also define a helper to query whether it is enabled. Signed-off-by: Paul Blakey Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 + .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 44 ++++++++++++++++------ include/linux/mlx5/eswitch.h | 7 ++++ 3 files changed, 41 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 9b5eaa8a47c4..ee36a8ab8e65 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -236,6 +236,7 @@ struct mlx5_esw_functions { enum { MLX5_ESWITCH_VPORT_MATCH_METADATA = BIT(0), + MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED = BIT(1), }; struct mlx5_eswitch { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index bdc2e247c053..5e751d7749db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -763,14 +763,21 @@ void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule) mlx5_del_flow_rules(rule); } +static bool mlx5_eswitch_reg_c1_loopback_supported(struct mlx5_eswitch *esw) +{ + return MLX5_CAP_ESW_FLOWTABLE(esw->dev, fdb_to_vport_reg_c_id) & + MLX5_FDB_TO_VPORT_REG_C_1; +} + static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) { u32 out[MLX5_ST_SZ_DW(query_esw_vport_context_out)] = {}; u32 in[MLX5_ST_SZ_DW(modify_esw_vport_context_in)] = {}; - u8 fdb_to_vport_reg_c_id; + u8 curr, wanted; int err; - if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + if (!mlx5_eswitch_reg_c1_loopback_supported(esw) && + !mlx5_eswitch_vport_match_metadata_enabled(esw)) return 0; err = mlx5_eswitch_query_esw_vport_context(esw->dev, 0, false, @@ -778,24 +785,33 @@ static int esw_set_passing_vport_metadata(struct mlx5_eswitch *esw, bool enable) if (err) return err; - fdb_to_vport_reg_c_id = MLX5_GET(query_esw_vport_context_out, out, - esw_vport_context.fdb_to_vport_reg_c_id); + curr = MLX5_GET(query_esw_vport_context_out, out, + esw_vport_context.fdb_to_vport_reg_c_id); + wanted = MLX5_FDB_TO_VPORT_REG_C_0; + if (mlx5_eswitch_reg_c1_loopback_supported(esw)) + wanted |= MLX5_FDB_TO_VPORT_REG_C_1; if (enable) - fdb_to_vport_reg_c_id |= MLX5_FDB_TO_VPORT_REG_C_0 | - MLX5_FDB_TO_VPORT_REG_C_1; + curr |= wanted; else - fdb_to_vport_reg_c_id &= ~(MLX5_FDB_TO_VPORT_REG_C_0 | - MLX5_FDB_TO_VPORT_REG_C_1); + curr &= ~wanted; MLX5_SET(modify_esw_vport_context_in, in, - esw_vport_context.fdb_to_vport_reg_c_id, fdb_to_vport_reg_c_id); + esw_vport_context.fdb_to_vport_reg_c_id, curr); MLX5_SET(modify_esw_vport_context_in, in, field_select.fdb_to_vport_reg_c_id, 1); - return mlx5_eswitch_modify_esw_vport_context(esw->dev, 0, false, - in, sizeof(in)); + err = mlx5_eswitch_modify_esw_vport_context(esw->dev, 0, false, in, + sizeof(in)); + if (!err) { + if (enable && (curr & MLX5_FDB_TO_VPORT_REG_C_1)) + esw->flags |= MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED; + else + esw->flags &= ~MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED; + } + + return err; } static void peer_miss_rules_setup(struct mlx5_eswitch *esw, @@ -2831,6 +2847,12 @@ bool mlx5_eswitch_is_vf_vport(const struct mlx5_eswitch *esw, u16 vport_num) vport_num <= esw->dev->priv.sriov.max_vfs; } +bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw) +{ + return !!(esw->flags & MLX5_ESWITCH_REG_C1_LOOPBACK_ENABLED); +} +EXPORT_SYMBOL(mlx5_eswitch_reg_c1_loopback_enabled); + bool 
mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) { return !!(esw->flags & MLX5_ESWITCH_VPORT_MATCH_METADATA); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 61705e74a5bb..c16827eeba9c 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -70,6 +70,7 @@ u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); enum devlink_eswitch_encap_mode mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev); +bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw); bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); /* Reg C0 usage: @@ -108,6 +109,12 @@ mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev) return DEVLINK_ESWITCH_ENCAP_MODE_NONE; } +static inline bool +mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw) +{ + return false; +}; + static inline bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) { -- cgit v1.2.3 From e31a50162feb352147d3fc87b9e036703c8f2636 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 12 Mar 2020 11:44:27 -0500 Subject: bitfield.h: add FIELD_MAX() and field_max() Define FIELD_MAX(), which supplies the maximum value that can be represented in a field. Define field_max() as well, to go along with the lower-case forms of the field mask functions. Signed-off-by: Alex Elder Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/bitfield.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 4bbb5f1c8b5b..48ea093ff04c 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -55,6 +55,19 @@ (1ULL << __bf_shf(_mask))); \ }) +/** + * FIELD_MAX() - produce the maximum value representable by a field + * @_mask: shifted mask defining the field's length and position + * + * FIELD_MAX() returns the maximum value that can be held in the field + * specified by @_mask. + */ +#define FIELD_MAX(_mask) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, 0ULL, "FIELD_MAX: "); \ + (typeof(_mask))((_mask) >> __bf_shf(_mask)); \ + }) + /** * FIELD_FIT() - check if value fits in the field * @_mask: shifted mask defining the field's length and position @@ -110,6 +123,7 @@ static __always_inline u64 field_mask(u64 field) { return field / field_multiplier(field); } +#define field_max(field) ((typeof(field))field_mask(field)) #define ____MAKE_OP(type,base,to,from) \ static __always_inline __##type type##_encode_bits(base v, base field) \ { \ -- cgit v1.2.3 From 1e2328e762548c7d17b7ba8ded9f409d05710dd1 Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:55 -0300 Subject: fs/nsfs.c: Added ns_match ns_match returns true if the namespace inode and dev_t match the ones provided by the caller. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200304204157.58695-2-cneirabustos@gmail.com --- fs/nsfs.c | 14 ++++++++++++++ include/linux/proc_ns.h | 2 ++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index b13bfd406820..4f1205725cfe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -247,6 +247,20 @@ out_invalid: return ERR_PTR(-EINVAL); } +/** + * ns_match() - Returns true if current namespace matches dev/ino provided.
+ * @ns_common: current ns + * @dev: dev_t from nsfs that will be matched against current nsfs + * @ino: ino_t from nsfs that will be matched against current nsfs + * + * Return: true if dev and ino matches the current nsfs. + */ +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) +{ + return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev); +} + + static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4626b1ac3b6c..adff08bfecf9 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -85,6 +85,8 @@ typedef struct ns_common *ns_get_path_helper_t(void *); extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, void *private_data); +extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + extern int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); extern void nsfs_init(void); -- cgit v1.2.3 From b4490c5c4e023f09b7d27c9a9d3e7ad7d09ea6bf Mon Sep 17 00:00:00 2001 From: Carlos Neira Date: Wed, 4 Mar 2020 17:41:56 -0300 Subject: bpf: Added new helper bpf_get_ns_current_pid_tgid New bpf helper: bpf_get_ns_current_pid_tgid. This helper returns the pid and tgid of the current task as seen from the pid namespace whose dev_t and inode number match those provided, which allows us to instrument a process inside a container. Signed-off-by: Carlos Neira Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200304204157.58695-3-cneirabustos@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 20 ++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 45 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++- 7 files changed, 88 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fd91b7c95ea..4ec835334a1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1497,6 +1497,7 @@ extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; +extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task.
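+ *
+ * A minimal usage sketch from a BPF program (illustrative only; it
+ * assumes *dev* and *ino* were obtained in userspace by stat()ing
+ * /proc/self/ns/pid and passed in, and that bpf_printk() from the
+ * tools' bpf_helpers.h is available):
+ *
+ *	struct bpf_pidns_info ns = {};
+ *
+ *	if (bpf_get_ns_current_pid_tgid(dev, ino, &ns, sizeof(ns)) == 0)
+ *		bpf_printk("ns pid %u tgid %u", ns.pid, ns.tgid);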
+ * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 973a20d49749..0f9ca46e1978 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2149,6 +2149,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d8b7b110a1c5..01878db15eaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,8 @@ #include <linux/filter.h> #include <linux/ctype.h> #include <linux/jiffies.h> +#include <linux/pid_namespace.h> +#include <linux/proc_ns.h> #include "../../lib/kstrtox.h" @@ -499,3 +501,46 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg4_type = ARG_PTR_TO_LONG, }; #endif + +BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, + struct bpf_pidns_info *, nsdata, u32, size) +{ + struct task_struct *task = current; + struct pid_namespace *pidns; + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_pidns_info))) + goto clear; + + if (unlikely((u64)(dev_t)dev != dev)) + goto clear; + + if (unlikely(!task)) + goto clear; + + pidns = task_active_pid_ns(task); + if (unlikely(!pidns)) { + err = -ENOENT; + goto clear; + } + + if (!ns_match(&pidns->ns, (dev_t)dev, ino)) + goto clear; + + nsdata->pid = task_pid_nr_ns(task, pidns); + nsdata->tgid = task_tgid_nr_ns(task, pidns); + return 0; +clear: + memset((void *)nsdata, 0, (size_t) size); + return err; +} + +const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { + .func = bpf_get_ns_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6a490d8ce9de..b5071c7e93ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -843,6 +843,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index cebed6fb5bbb..c1e2b5410faa 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -435,6 +435,7 @@ class PrinterHelpers(Printer): 'struct bpf_fib_lookup', 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 40b2d9476268..15b239da775b 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2914,6 +2914,19 @@ union bpf_attr { * of
sizeof(struct perf_branch_entry). * * **-ENOENT** if architecture does not support branch records. + * + * int bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3035,7 +3048,8 @@ union bpf_attr { FN(tcp_send_ack), \ FN(send_signal_thread), \ FN(jiffies64), \ - FN(read_branch_records), + FN(read_branch_records), \ + FN(get_ns_current_pid_tgid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3829,4 +3843,8 @@ struct bpf_sockopt { __s32 retval; }; +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From fc6a9f86f08acd3665f788619afae0d2b2d5a480 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 10 Mar 2020 10:22:28 +0200 Subject: {IB,net}/mlx5: Assign mkey variant in mlx5_ib only mkey variant is not required for mlx5_core use, move the mkey variant counter to mlx5_ib. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +++ drivers/infiniband/hw/mlx5/mr.c | 58 +++++++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/mr.c | 8 +--- include/linux/mlx5/driver.h | 4 -- 6 files changed, 55 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e4bcfa81b70a..fce863621414 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6390,6 +6390,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); xa_init(&dev->sig_mrs); + spin_lock_init(&dev->mkey_lock); spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d9bffcc93587..89a050e516a8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -992,6 +992,11 @@ struct mlx5_ib_dev { /* sync used page count stats */ struct mlx5_ib_resources devr; + + /* protect mkey key part */ + spinlock_t mkey_lock; + u8 mkey_key; + struct mlx5_mr_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 45c3282dd5e1..1b83d00e8ecd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -47,6 +47,46 @@ enum { #define MLX5_UMR_ALIGN 2048 +static void +create_mkey_callback(int status, struct mlx5_async_work *context); + +static void +assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey, + u32 *in) +{ + void *mkc; + u8 key; + + spin_lock_irq(&dev->mkey_lock); + key = dev->mkey_key++; + spin_unlock_irq(&dev->mkey_lock); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, mkey_7_0, key); + mkey->key = key; +} + +static int +mlx5_ib_create_mkey(struct 
mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey, + u32 *in, int inlen) +{ + assign_mkey_variant(dev, mkey, in); + return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen); +} + +static int +mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, + struct mlx5_core_mkey *mkey, + struct mlx5_async_ctx *async_ctx, + u32 *in, int inlen, u32 *out, int outlen, + struct mlx5_async_work *context) +{ + assign_mkey_variant(dev, mkey, in); + return mlx5_core_create_mkey_cb(dev->mdev, mkey, async_ctx, + in, inlen, out, outlen, + create_mkey_callback, context); +} + static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); @@ -79,7 +119,7 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); } -static void reg_mr_callback(int status, struct mlx5_async_work *context) +static void create_mkey_callback(int status, struct mlx5_async_work *context) { struct mlx5_ib_mr *mr = container_of(context, struct mlx5_ib_mr, cb_work); @@ -160,10 +200,10 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey, + err = mlx5_ib_create_mkey_cb(dev, &mr->mmkey, &dev->async_ctx, in, inlen, mr->out, sizeof(mr->out), - reg_mr_callback, &mr->cb_work); + &mr->cb_work); if (err) { spin_lock_irq(&ent->lock); ent->pending--; @@ -682,7 +722,6 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_mr *mr; void *mkc; u32 *in; @@ -704,7 +743,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc, 0, pd); - err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) goto err_in; @@ -1094,7 +1133,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, get_octo_len(virt_addr, length, page_shift)); } - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; @@ -1134,7 +1173,6 @@ static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, { struct mlx5_ib_dev *dev = to_mdev(pd->device); int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_mr *mr; void *mkc; u32 *in; @@ -1157,7 +1195,7 @@ static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, MLX5_SET64(mkc, mkc, len, length); set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); - err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) goto err_in; @@ -1635,7 +1673,7 @@ static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) goto err_free_descs; @@ -1902,7 +1940,7 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); MLX5_SET(mkc, mkc, qpn, 0xffffff); - err = 
mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen); + err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); if (err) goto free; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index f554cfddcf4e..6b38ec72215a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1282,7 +1282,6 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mutex_init(&priv->alloc_mutex); mutex_init(&priv->pgdir_mutex); INIT_LIST_HEAD(&priv->pgdir_list); - spin_lock_init(&priv->mkey_lock); priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 770d13bb4f20..51814d023efb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -49,14 +49,7 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, int err; u8 key; - spin_lock_irq(&dev->priv.mkey_lock); - key = dev->priv.mkey_key++; - spin_unlock_irq(&dev->priv.mkey_lock); - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); - MLX5_SET(mkc, mkc, mkey_7_0, key); - mkey->key = key; if (callback) return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, @@ -66,6 +59,7 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, if (err) return err; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); mkey_index = MLX5_GET(create_mkey_out, lout, mkey_index); mkey->iova = MLX5_GET64(mkc, mkc, start_addr); mkey->size = MLX5_GET64(mkc, mkc, len); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f2b4225ed650..e044703c056b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -575,10 +575,6 @@ struct mlx5_priv { /* end: alloc staff */ struct dentry *dbg_root; - /* protect mkey key part */ - spinlock_t mkey_lock; - u8 mkey_key; - struct list_head dev_list; struct list_head ctx_list; spinlock_t ctx_lock; -- cgit v1.2.3 From a3cfdd3928113012d0f2c5353277f4e27878a663 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Tue, 10 Mar 2020 10:22:30 +0200 Subject: {IB,net}/mlx5: Move asynchronous mkey creation to mlx5_ib As mlx5_ib is the only user of the mlx5_core_create_mkey_cb, move the logic inside mlx5_ib and cleanup the code in mlx5_core. 
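For illustration, the resulting division of labor can be sketched as follows (a summary of the diff below, with the call chains abbreviated; not additional code):

	/* sync path: mlx5_ib picks the mkey variant, mlx5_core still owns the command */
	mlx5_ib_create_mkey()
		-> assign_mkey_variant(dev, mkey, in)
		-> mlx5_core_create_mkey(dev->mdev, mkey, in, inlen)	/* sets the CREATE_MKEY opcode itself */

	/* async path: now composed entirely inside mlx5_ib */
	mlx5_ib_create_mkey_cb()
		-> MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY)	/* caller sets the opcode */
		-> assign_mkey_variant(dev, mkey, in)
		-> mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, create_mkey_callback, context)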
Signed-off-by: Michael Guralnik Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 22 +++------------------- include/linux/mlx5/driver.h | 6 ------ 3 files changed, 6 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 1b83d00e8ecd..8508af500972 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -81,10 +81,10 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, u32 *in, int inlen, u32 *out, int outlen, struct mlx5_async_work *context) { + MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); assign_mkey_variant(dev, mkey, in); - return mlx5_core_create_mkey_cb(dev->mdev, mkey, async_ctx, - in, inlen, out, outlen, - create_mkey_callback, context); + return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, + create_mkey_callback, context); } static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 51814d023efb..fd3e6d217c3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,12 +36,9 @@ #include #include "mlx5_core.h" -int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey, - struct mlx5_async_ctx *async_ctx, u32 *in, - int inlen, u32 *out, int outlen, - mlx5_async_cbk_t callback, - struct mlx5_async_work *context) +int mlx5_core_create_mkey(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen) { u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; u32 mkey_index; @@ -51,10 +48,6 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); - if (callback) - return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, - callback, context); - err = mlx5_cmd_exec(dev, in, inlen, lout, sizeof(lout)); if (err) return err; @@ -70,15 +63,6 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, mkey_index, key, mkey->key); return 0; } -EXPORT_SYMBOL(mlx5_core_create_mkey_cb); - -int mlx5_core_create_mkey(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey, - u32 *in, int inlen) -{ - return mlx5_core_create_mkey_cb(dev, mkey, NULL, in, inlen, - NULL, 0, NULL, NULL); -} EXPORT_SYMBOL(mlx5_core_create_mkey); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e044703c056b..1de78f001d26 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -943,12 +943,6 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, gfp_t flags, int npages); void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); -int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, - struct mlx5_core_mkey *mkey, - struct mlx5_async_ctx *async_ctx, u32 *in, - int inlen, u32 *out, int outlen, - mlx5_async_cbk_t callback, - struct mlx5_async_work *context); int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, u32 *in, int inlen); -- cgit v1.2.3 From 98868668367b24487c0b0b3298d7ca98409baf07 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 12 Mar 2020 17:21:28 -0700 Subject: bpf: Abstract away entire bpf_link clean up procedure Instead of requiring users to do three steps for cleaning up bpf_link, its anon_inode file, and unused fd, 
abstract that away into bpf_link_cleanup() helper. bpf_link_defunct() is removed, as it shouldn't be needed as an individual operation anymore. v1->v2: - keep bpf_link_cleanup() static for now (Daniel). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200313002128.2028680-1-andriin@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - kernel/bpf/syscall.c | 18 +++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4ec835334a1f..c2f815e9f7d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1075,7 +1075,6 @@ struct bpf_link_ops { void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); -void bpf_link_defunct(struct bpf_link *link); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2f73ecacced..85567a6ea5f9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2188,9 +2188,17 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, link->prog = prog; } -void bpf_link_defunct(struct bpf_link *link) +/* Clean up bpf_link and corresponding anon_inode file and FD. After + * anon_inode is created, bpf_link can't be just kfree()'d due to deferred + * anon_inode's release() call. This helper manages marking bpf_link as + * defunct, releases anon_inode file and puts reserved FD. + */ +static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) { link->prog = NULL; + fput(link_file); + put_unused_fd(link_fd); } void bpf_link_inc(struct bpf_link *link) @@ -2383,9 +2391,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) err = bpf_trampoline_link_prog(prog); if (err) { - bpf_link_defunct(&link->link); - fput(link_file); - put_unused_fd(link_fd); + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_prog; } @@ -2498,9 +2504,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = bpf_probe_register(link->btp, prog); if (err) { - bpf_link_defunct(&link->link); - fput(link_file); - put_unused_fd(link_fd); + bpf_link_cleanup(&link->link, link_file, link_fd); goto out_put_btp; } -- cgit v1.2.3 From 6a64037d4bf252bb8cf13917320c8e001da8997a Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 12 Mar 2020 20:55:57 +0100 Subject: bpf: Add bpf_trampoline_ name prefix for DECLARE_BPF_DISPATCHER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding bpf_trampoline_ name prefix for DECLARE_BPF_DISPATCHER, so all the dispatchers have the common name prefix. And also a small '_' cleanup for bpf_dispatcher_nopfunc function name. 
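The pattern here is the classic consolidated error path: three steps that must always happen together become one helper, so no call site can forget one. Below is a compilable userspace model of the idea; struct file, fput() and put_unused_fd() are reduced to local stubs that only mimic the kernel names.

/* Compilable model of the cleanup consolidation; all stubs are local. */
#include <stdio.h>

struct file { int busy; };
struct bpf_link { void *prog; };

static void fput(struct file *f) { f->busy = 0; }
static void put_unused_fd(int fd) { printf("fd %d returned\n", fd); }

/* One helper instead of three open-coded steps at every error site. */
static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file,
			     int link_fd)
{
	link->prog = NULL;		/* mark the link defunct */
	fput(link_file);		/* drop the anon_inode file */
	put_unused_fd(link_fd);		/* release the reserved fd */
}

int main(void)
{
	struct file f = { .busy = 1 };
	struct bpf_link link = { .prog = &f };
	int err = -1;			/* pretend the attach step failed */

	if (err)
		bpf_link_cleanup(&link, &f, 3);
	return 0;
}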
Signed-off-by: Björn Töpel Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++++++++++---------- include/linux/filter.h | 9 ++++----- net/core/filter.c | 5 ++--- 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2f815e9f7d0..fe1f8b075378 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -522,7 +522,7 @@ struct bpf_dispatcher { u32 image_off; }; -static __always_inline unsigned int bpf_dispatcher_nopfunc( +static __always_inline unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, @@ -537,7 +537,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_INIT(name) { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ - .func = &name##func, \ + .func = &name##_func, \ .progs = {}, \ .num_progs = 0, \ .image = NULL, \ @@ -545,7 +545,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); } #define DEFINE_BPF_DISPATCHER(name) \ - noinline unsigned int name##func( \ + noinline unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ unsigned int (*bpf_func)(const void *, \ @@ -553,17 +553,18 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); { \ return bpf_func(ctx, insnsi); \ } \ - EXPORT_SYMBOL(name##func); \ - struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name); + EXPORT_SYMBOL(bpf_dispatcher_##name##_func); \ + struct bpf_dispatcher bpf_dispatcher_##name = \ + BPF_DISPATCHER_INIT(bpf_dispatcher_##name); #define DECLARE_BPF_DISPATCHER(name) \ - unsigned int name##func( \ + unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ unsigned int (*bpf_func)(const void *, \ const struct bpf_insn *)); \ - extern struct bpf_dispatcher name; -#define BPF_DISPATCHER_FUNC(name) name##func -#define BPF_DISPATCHER_PTR(name) (&name) + extern struct bpf_dispatcher bpf_dispatcher_##name; +#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func +#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); struct bpf_image { @@ -589,7 +590,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) -#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc +#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func #define BPF_DISPATCHER_PTR(name) NULL static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b5e455d2f5..6249679275b3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -577,7 +577,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); ret; }) #define BPF_PROG_RUN(prog, ctx) \ - __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc) + __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) /* * Use in preemptible and therefore migratable context to make sure that @@ -596,7 +596,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, u32 ret; migrate_disable(); - ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc); 
+ ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func); migrate_enable(); return ret; } @@ -722,7 +722,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, return res; } -DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp) +DECLARE_BPF_DISPATCHER(xdp) static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) @@ -733,8 +733,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, * already takes rcu_read_lock() when fetching the program, so * it's not necessary here anymore. */ - return __BPF_PROG_RUN(prog, xdp, - BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp)); + return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); } void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); diff --git a/net/core/filter.c b/net/core/filter.c index 22219544410f..96350a743539 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8859,10 +8859,9 @@ const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */ -DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp) +DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { - bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp), - prev_prog, prog); + bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -- cgit v1.2.3 From 535911c80ad4f5801700e9d827a1985bbff41519 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:55:58 +0100 Subject: bpf: Add struct bpf_ksym Adding 'struct bpf_ksym' object that will carry the kallsym information for bpf symbol. Adding the start and end address to begin with. It will be used by bpf_prog, bpf_trampoline, bpf_dispatcher objects. The symbol_start/symbol_end values were originally used to sort bpf_prog objects. For the address displayed in /proc/kallsyms we are using prog->bpf_func value. I'm using the bpf_func value for program symbol start instead of the symbol_start, because it makes no difference for sorting bpf_prog objects and we can use it directly as an address to display it in /proc/kallsyms. 
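With start and end captured in struct bpf_ksym, address lookup reduces to a range comparison. Here is a self-contained sketch of the tri-state comparator convention this series uses for the latch tree; the addresses are made up for illustration.

/* Self-contained sketch of the range comparator; addresses are invented. */
#include <stdio.h>

struct bpf_ksym { unsigned long start, end; };

/* Same tri-state convention as bpf_tree_comp(): negative below the
 * symbol, positive at or past its end, zero inside [start, end). */
static int ksym_cmp(unsigned long addr, const struct bpf_ksym *k)
{
	if (addr < k->start)
		return -1;
	if (addr >= k->end)
		return 1;
	return 0;
}

int main(void)
{
	struct bpf_ksym sym = { .start = 0x1000, .end = 0x2000 };

	/* prints "-1 0 1": below, inside, and at the exclusive end */
	printf("%d %d %d\n", ksym_cmp(0x0fff, &sym),
	       ksym_cmp(0x1800, &sym), ksym_cmp(0x2000, &sym));
	return 0;
}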
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-4-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 28 ++++++++++++---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe1f8b075378..6ca3d5c8ccf3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -471,6 +471,11 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +struct bpf_ksym { + unsigned long start; + unsigned long end; +}; + enum bpf_tramp_prog_type { BPF_TRAMP_FENTRY, BPF_TRAMP_FEXIT, @@ -653,6 +658,7 @@ struct bpf_prog_aux { u32 size_poke_tab; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; + struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct bpf_prog *prog; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0f9ca46e1978..e587d6306d7c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -523,18 +523,16 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; -static __always_inline void -bpf_get_prog_addr_region(const struct bpf_prog *prog, - unsigned long *symbol_start, - unsigned long *symbol_end) +static void +bpf_prog_ksym_set_addr(struct bpf_prog *prog) { const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); unsigned long addr = (unsigned long)hdr; WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); - *symbol_start = addr; - *symbol_end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.start = (unsigned long) prog->bpf_func; + prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) @@ -575,13 +573,10 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) static __always_inline unsigned long bpf_get_prog_addr_start(struct latch_tree_node *n) { - unsigned long symbol_start, symbol_end; const struct bpf_prog_aux *aux; aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - - return symbol_start; + return aux->ksym.start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, @@ -593,15 +588,13 @@ static __always_inline bool bpf_tree_less(struct latch_tree_node *a, static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - unsigned long symbol_start, symbol_end; const struct bpf_prog_aux *aux; aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); - if (val < symbol_start) + if (val < aux->ksym.start) return -1; - if (val >= symbol_end) + if (val >= aux->ksym.end) return 1; return 0; @@ -649,6 +642,8 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) !capable(CAP_SYS_ADMIN)) return; + bpf_prog_ksym_set_addr(fp); + spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); spin_unlock_bh(&bpf_lock); @@ -677,14 +672,15 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - unsigned long symbol_start, symbol_end; struct bpf_prog *prog; char *ret = NULL; rcu_read_lock(); prog = bpf_prog_kallsyms_find(addr); if 
(prog) { - bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); + unsigned long symbol_start = prog->aux->ksym.start; + unsigned long symbol_end = prog->aux->ksym.end; + bpf_get_prog_name(prog, sym); ret = sym; -- cgit v1.2.3 From bfea9a8574f34597581f74f792d044d38497b775 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:55:59 +0100 Subject: bpf: Add name to struct bpf_ksym Adding name to 'struct bpf_ksym' object to carry the name of the symbol for bpf_prog, bpf_trampoline, bpf_dispatcher objects. The current benefit is that name is now generated only when the symbol is added to the list, so we don't need to generate it every time it's accessed. The future benefit is that we will have all the bpf objects symbols represented by struct bpf_ksym. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-5-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/filter.h | 6 ------ kernel/bpf/core.c | 9 ++++++--- kernel/events/core.c | 9 ++++----- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6ca3d5c8ccf3..047b44deb3c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -18,6 +18,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -474,6 +475,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); struct bpf_ksym { unsigned long start; unsigned long end; + char name[KSYM_NAME_LEN]; }; enum bpf_tramp_prog_type { diff --git a/include/linux/filter.h b/include/linux/filter.h index 6249679275b3..9b5aa5c483cc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1083,7 +1083,6 @@ bpf_address_lookup(unsigned long addr, unsigned long *size, void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym); #else /* CONFIG_BPF_JIT */ @@ -1152,11 +1151,6 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } -static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) -{ - sym[0] = '\0'; -} - #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e587d6306d7c..f6800c2d4b01 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -535,8 +535,10 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog) prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; } -void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +static void +bpf_prog_ksym_set_name(struct bpf_prog *prog) { + char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; @@ -643,6 +645,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) return; bpf_prog_ksym_set_addr(fp); + bpf_prog_ksym_set_name(fp); spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); @@ -681,7 +684,7 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = prog->aux->ksym.start; unsigned long symbol_end = prog->aux->ksym.end; - bpf_get_prog_name(prog, sym); + strncpy(sym, prog->aux->ksym.name, KSYM_NAME_LEN); ret = sym; if (size) @@ -738,7 +741,7 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, if (it++ != symnum) continue; - bpf_get_prog_name(aux->prog, sym); + strncpy(sym, aux->ksym.name, KSYM_NAME_LEN); *value 
= (unsigned long)aux->prog->bpf_func; *type = BPF_SYM_ELF_TYPE; diff --git a/kernel/events/core.c b/kernel/events/core.c index bbdfac0182f4..9b89ef176247 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8255,23 +8255,22 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, enum perf_bpf_event_type type) { bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; - char sym[KSYM_NAME_LEN]; int i; if (prog->aux->func_cnt == 0) { - bpf_get_prog_name(prog, sym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, sym); + prog->jited_len, unregister, + prog->aux->ksym.name); } else { for (i = 0; i < prog->aux->func_cnt; i++) { struct bpf_prog *subprog = prog->aux->func[i]; - bpf_get_prog_name(subprog, sym); perf_event_ksymbol( PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, sym); + subprog->jited_len, unregister, + prog->aux->ksym.name); } } } -- cgit v1.2.3 From ecb60d1c670e9b205197d8e4381b19e77bc2d834 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:00 +0100 Subject: bpf: Move lnode list node to struct bpf_ksym Adding lnode list node to 'struct bpf_ksym' object, so the struct bpf_ksym itself can be chained and used in other objects like bpf_trampoline and bpf_dispatcher. Changing iterator to bpf_ksym in bpf_get_kallsym function. The ksym->start is holding the prog->bpf_func value, so it's ok to use it as value in bpf_get_kallsym. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200312195610.346362-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 047b44deb3c5..4fad2fa4135c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -476,6 +476,7 @@ struct bpf_ksym { unsigned long start; unsigned long end; char name[KSYM_NAME_LEN]; + struct list_head lnode; }; enum bpf_tramp_prog_type { @@ -659,7 +660,6 @@ struct bpf_prog_aux { struct bpf_jit_poke_descriptor *poke_tab; u32 size_poke_tab; struct latch_tree_node ksym_tnode; - struct list_head ksym_lnode; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f6800c2d4b01..5eb5d5bb7a95 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); - INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); + INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); return fp; } @@ -613,18 +613,18 @@ static struct latch_tree_root bpf_tree __cacheline_aligned; static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { - WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); - list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); + WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); + list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); } static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) { - if (list_empty(&aux->ksym_lnode)) + if (list_empty(&aux->ksym.lnode)) return; latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym_lnode); + list_del_rcu(&aux->ksym.lnode); } static bool bpf_prog_kallsyms_candidate(const struct 
bpf_prog *fp) @@ -634,8 +634,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { - return list_empty(&fp->aux->ksym_lnode) || - fp->aux->ksym_lnode.prev == LIST_POISON2; + return list_empty(&fp->aux->ksym.lnode) || + fp->aux->ksym.lnode.prev == LIST_POISON2; } void bpf_prog_kallsyms_add(struct bpf_prog *fp) @@ -729,7 +729,7 @@ out: int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { - struct bpf_prog_aux *aux; + struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; @@ -737,13 +737,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, return ret; rcu_read_lock(); - list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { + list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; - strncpy(sym, aux->ksym.name, KSYM_NAME_LEN); + strncpy(sym, ksym->name, KSYM_NAME_LEN); - *value = (unsigned long)aux->prog->bpf_func; + *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; -- cgit v1.2.3 From ca4424c920f574b7246ff1b6d83cfdfd709e42c8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:01 +0100 Subject: bpf: Move ksym_tnode to bpf_ksym Moving ksym_tnode list node to 'struct bpf_ksym' object, so the symbol itself can be chained and used in other objects like bpf_trampoline and bpf_dispatcher. We need bpf_ksym object to be linked both in bpf_kallsyms via lnode for /proc/kallsyms and in bpf_tree via tnode for bpf address lookup functions like __bpf_address_lookup or bpf_prog_kallsyms_find. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-7-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/core.c | 24 ++++++++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4fad2fa4135c..68d66b0078df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -477,6 +477,7 @@ struct bpf_ksym { unsigned long end; char name[KSYM_NAME_LEN]; struct list_head lnode; + struct latch_tree_node tnode; }; enum bpf_tramp_prog_type { @@ -659,7 +660,6 @@ struct bpf_prog_aux { void *jit_data; /* JIT specific data. 
arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; u32 size_poke_tab; - struct latch_tree_node ksym_tnode; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5eb5d5bb7a95..ab1846c34167 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -572,31 +572,27 @@ bpf_prog_ksym_set_name(struct bpf_prog *prog) *sym = 0; } -static __always_inline unsigned long -bpf_get_prog_addr_start(struct latch_tree_node *n) +static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { - const struct bpf_prog_aux *aux; - - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); - return aux->ksym.start; + return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { - return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); + return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; - const struct bpf_prog_aux *aux; + const struct bpf_ksym *ksym; - aux = container_of(n, struct bpf_prog_aux, ksym_tnode); + ksym = container_of(n, struct bpf_ksym, tnode); - if (val < aux->ksym.start) + if (val < ksym->start) return -1; - if (val >= aux->ksym.end) + if (val >= ksym->end) return 1; return 0; @@ -615,7 +611,7 @@ static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + latch_tree_insert(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); } static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) @@ -623,7 +619,7 @@ static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) if (list_empty(&aux->ksym.lnode)) return; - latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); + latch_tree_erase(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&aux->ksym.lnode); } @@ -668,7 +664,7 @@ static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? - container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : + container_of(n, struct bpf_prog_aux, ksym.tnode)->prog : NULL; } -- cgit v1.2.3 From cbd76f8d5ac9c4e99c4ffe5e39a1e907cdf5a76f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:03 +0100 Subject: bpf: Add prog flag to struct bpf_ksym object Adding 'prog' bool flag to 'struct bpf_ksym' to mark that this object belongs to bpf_prog object. This change allows having bpf_prog objects together with other types (trampolines and dispatchers) in the single bpf_tree. It's used when searching for bpf_prog exception tables by the bpf_prog_ksym_find function, where we need to get the bpf_prog pointer. From now we can safely add bpf_ksym support for trampoline or dispatcher objects, because we can differentiate them from bpf_prog objects.
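The prog flag works together with container_of(): any bpf_ksym found in the tree can be converted back to its embedding bpf_prog_aux, but only when the flag says it is safe to do so. A minimal userspace model follows; container_of() is restated locally and the id field is invented for the demo, so only the flag idiom itself mirrors the patch.

/* Userspace model of the discriminated container_of() lookup. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct bpf_ksym { int prog; };			/* type discriminator */
struct bpf_prog_aux { int id; struct bpf_ksym ksym; };

/* Mirrors bpf_prog_ksym_find(): convert back to the embedding object
 * only when the flag says the ksym belongs to a program. */
static struct bpf_prog_aux *ksym_to_prog(struct bpf_ksym *ksym)
{
	return ksym && ksym->prog ?
	       container_of(ksym, struct bpf_prog_aux, ksym) : NULL;
}

int main(void)
{
	struct bpf_prog_aux aux = { .id = 7, .ksym = { .prog = 1 } };
	struct bpf_ksym tramp_sym = { .prog = 0 };	/* e.g. a trampoline */

	printf("prog id: %d\n", ksym_to_prog(&aux.ksym)->id);
	printf("non-prog lookup: %p\n", (void *)ksym_to_prog(&tramp_sym));
	return 0;
}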
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-9-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 68d66b0078df..a0cef664c1a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -478,6 +478,7 @@ struct bpf_ksym { char name[KSYM_NAME_LEN]; struct list_head lnode; struct latch_tree_node tnode; + bool prog; }; enum bpf_tramp_prog_type { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd380f7f015c..7516cbc65996 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -642,6 +642,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); + fp->aux->ksym.prog = true; spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); @@ -658,16 +659,6 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) spin_unlock_bh(&bpf_lock); } -static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) -{ - struct latch_tree_node *n; - - n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); - return n ? - container_of(n, struct bpf_prog_aux, ksym.tnode)->prog : - NULL; -} - static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; @@ -712,13 +703,22 @@ bool is_bpf_text_address(unsigned long addr) return ret; } +static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) +{ + struct bpf_ksym *ksym = bpf_ksym_find(addr); + + return ksym && ksym->prog ? + container_of(ksym, struct bpf_prog_aux, ksym)->prog : + NULL; +} + const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); - prog = bpf_prog_kallsyms_find(addr); + prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) -- cgit v1.2.3 From dba122fb5e122e8e07e2f11cdebc10ba4f425cf7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:04 +0100 Subject: bpf: Add bpf_ksym_add/del functions Separating /proc/kallsyms add/del code and adding bpf_ksym_add/del functions for that. Moving bpf_prog_ksym_node_add/del functions to __bpf_ksym_add/del and changing their argument to 'struct bpf_ksym' object. This way we can call them for other bpf objects types like trampoline and dispatcher. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-10-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/core.c | 33 +++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a0cef664c1a9..ec1de88b8487 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -584,6 +584,9 @@ struct bpf_image { #define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image)) bool is_bpf_image_address(unsigned long address); void *bpf_image_alloc(void); +/* Called only from JIT-enabled code, so there's no need for stubs. 
*/ +void bpf_ksym_add(struct bpf_ksym *ksym); +void bpf_ksym_del(struct bpf_ksym *ksym); #else static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7516cbc65996..914f3463aa41 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -607,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +void bpf_ksym_add(struct bpf_ksym *ksym) { - WARN_ON_ONCE(!list_empty(&aux->ksym.lnode)); - list_add_tail_rcu(&aux->ksym.lnode, &bpf_kallsyms); - latch_tree_insert(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); + spin_lock_bh(&bpf_lock); + WARN_ON_ONCE(!list_empty(&ksym->lnode)); + list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); + latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + spin_unlock_bh(&bpf_lock); } -static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +static void __bpf_ksym_del(struct bpf_ksym *ksym) { - if (list_empty(&aux->ksym.lnode)) + if (list_empty(&ksym->lnode)) return; - latch_tree_erase(&aux->ksym.tnode, &bpf_tree, &bpf_tree_ops); - list_del_rcu(&aux->ksym.lnode); + latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); + list_del_rcu(&ksym->lnode); +} + +void bpf_ksym_del(struct bpf_ksym *ksym) +{ + spin_lock_bh(&bpf_lock); + __bpf_ksym_del(ksym); + spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) @@ -644,9 +653,7 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp) bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_add(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_add(&fp->aux->ksym); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) @@ -654,9 +661,7 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp) if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_bh(&bpf_lock); - bpf_prog_ksym_node_del(fp->aux); - spin_unlock_bh(&bpf_lock); + bpf_ksym_del(&fp->aux->ksym); } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) -- cgit v1.2.3 From a108f7dcfa010e3da825af90d77ac0a6a0240992 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:05 +0100 Subject: bpf: Add trampolines to kallsyms Adding trampolines to kallsyms. It's displayed as bpf_trampoline_<ID> [bpf] where ID is the BTF id of the trampoline function. Adding bpf_image_ksym_add/del functions that set up the start/end values and call KSYMBOL perf events handlers. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-11-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ kernel/bpf/trampoline.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ec1de88b8487..083860be1944 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -513,6 +513,7 @@ struct bpf_trampoline { /* Executable image of trampoline */ void *image; u64 selector; + struct bpf_ksym ksym; }; #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ @@ -585,6 +586,8 @@ struct bpf_image { bool is_bpf_image_address(unsigned long address); void *bpf_image_alloc(void); /* Called only from JIT-enabled code, so there's no need for stubs.
*/ +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 221a17af1f81..36549c9afec4 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -5,6 +5,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -96,6 +97,30 @@ bool is_bpf_image_address(unsigned long addr) return ret; } +void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) +{ + ksym->start = (unsigned long) data; + ksym->end = ksym->start + BPF_IMAGE_SIZE; + bpf_ksym_add(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + BPF_IMAGE_SIZE, false, ksym->name); +} + +void bpf_image_ksym_del(struct bpf_ksym *ksym) +{ + bpf_ksym_del(ksym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, + BPF_IMAGE_SIZE, true, ksym->name); +} + +static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) +{ + struct bpf_ksym *ksym = &tr->ksym; + + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); + bpf_image_ksym_add(tr->image, ksym); +} + struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; @@ -131,6 +156,8 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); tr->image = image; + INIT_LIST_HEAD_RCU(&tr->ksym.lnode); + bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -368,6 +395,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; + bpf_image_ksym_del(&tr->ksym); image = container_of(tr->image, struct bpf_image, data); latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ -- cgit v1.2.3 From 517b75e44c7be9c776aa5f7beaa85baff3868f80 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:06 +0100 Subject: bpf: Add dispatchers to kallsyms Adding dispatchers to kallsyms. It's displayed as bpf_dispatcher_<NAME> where NAME is the name of the dispatcher.
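Unlike trampolines, whose names are formatted at runtime with snprintf(), a dispatcher's symbol name can be baked in at compile time via the preprocessor's # stringification operator, which is what BPF_DISPATCHER_INIT() does with #_name in the patch below. Here is a standalone sketch of that mechanism; KSYM_NAME_LEN and the struct layout are trimmed to the one member the naming scheme needs.

/* Standalone sketch of compile-time symbol naming via stringification. */
#include <stdio.h>

#define KSYM_NAME_LEN 128

struct bpf_ksym { char name[KSYM_NAME_LEN]; };
struct dispatcher { struct bpf_ksym ksym; };

/* The # operator turns the macro argument into the symbol string at
 * compile time, mirroring the .ksym.name = #_name initializer. */
#define DISPATCHER_INIT(_name) { .ksym = { .name = #_name } }

static struct dispatcher bpf_dispatcher_xdp =
	DISPATCHER_INIT(bpf_dispatcher_xdp);

int main(void)
{
	printf("kallsyms would show: %s\n", bpf_dispatcher_xdp.ksym.name);
	return 0;
}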
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-12-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++++++------- kernel/bpf/dispatcher.c | 1 + 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 083860be1944..86cacb54ba23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -531,6 +531,7 @@ struct bpf_dispatcher { int num_progs; void *image; u32 image_off; + struct bpf_ksym ksym; }; static __always_inline unsigned int bpf_dispatcher_nop_func( @@ -546,13 +547,17 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); void bpf_trampoline_put(struct bpf_trampoline *tr); -#define BPF_DISPATCHER_INIT(name) { \ - .mutex = __MUTEX_INITIALIZER(name.mutex), \ - .func = &name##_func, \ - .progs = {}, \ - .num_progs = 0, \ - .image = NULL, \ - .image_off = 0 \ +#define BPF_DISPATCHER_INIT(_name) { \ + .mutex = __MUTEX_INITIALIZER(_name.mutex), \ + .func = &_name##_func, \ + .progs = {}, \ + .num_progs = 0, \ + .image = NULL, \ + .image_off = 0, \ + .ksym = { \ + .name = #_name, \ + .lnode = LIST_HEAD_INIT(_name.ksym.lnode), \ + }, \ } #define DEFINE_BPF_DISPATCHER(name) \ diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index b3e5b214fed8..a2679bae9e73 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -143,6 +143,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, d->image = bpf_image_alloc(); if (!d->image) goto out; + bpf_image_ksym_add(d->image, &d->ksym); } prev_num_progs = d->num_progs; -- cgit v1.2.3 From 7ac88eba185b4d0e06a71678e54bc092edcd3af3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 Mar 2020 20:56:07 +0100 Subject: bpf: Remove bpf_image tree Now that we have all the objects (bpf_prog, bpf_trampoline, bpf_dispatcher) linked in bpf_tree, there's no need to have separate bpf_image tree for images. Reverting the bpf_image tree together with struct bpf_image, because it's no longer needed. Also removing bpf_image_alloc function and adding the original bpf_jit_alloc_exec_page interface instead. The kernel_text_address function can now rely only on is_bpf_text_address, because it checks the bpf_tree that contains all the objects. Keeping bpf_image_ksym_add and bpf_image_ksym_del because they are useful wrappers with perf's ksymbol interface calls. 
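Both trampolines and dispatchers update their code through the same double-buffering trick visible in this patch: a single executable page is split into two halves, and an XOR against PAGE_SIZE/2 flips which half receives the next version while the other half stays live. The tiny demonstration below covers only the offset arithmetic; PAGE_SIZE is assumed to be 4096 and the "writes" are just prints.

/* Demonstration of the half-page flip used for live code updates. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long image_off = 0;
	int i;

	for (i = 0; i < 4; i++) {
		printf("update %d lands at offset %lu\n", i, image_off);
		image_off ^= PAGE_SIZE / 2;	/* flip to the other half */
	}
	return 0;
}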
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200312195610.346362-13-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +---- kernel/bpf/dispatcher.c | 4 +-- kernel/bpf/trampoline.c | 83 ++++++------------------------------------------- kernel/extable.c | 2 -- 4 files changed, 13 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 86cacb54ba23..bdb981c204fa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -583,14 +583,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name) void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); -struct bpf_image { - struct latch_tree_node tnode; - unsigned char data[]; -}; -#define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image)) -bool is_bpf_image_address(unsigned long address); -void *bpf_image_alloc(void); /* Called only from JIT-enabled code, so there's no need for stubs. */ +void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index a2679bae9e73..2444bd15cc2d 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) noff = 0; } else { old = d->image + d->image_off; - noff = d->image_off ^ (BPF_IMAGE_SIZE / 2); + noff = d->image_off ^ (PAGE_SIZE / 2); } new = d->num_progs ? d->image + noff : NULL; @@ -140,7 +140,7 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, mutex_lock(&d->mutex); if (!d->image) { - d->image = bpf_image_alloc(); + d->image = bpf_jit_alloc_exec_page(); if (!d->image) goto out; bpf_image_ksym_add(d->image, &d->ksym); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 36549c9afec4..f42f700c1d28 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -18,12 +18,11 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; -static struct latch_tree_root image_tree __cacheline_aligned; -/* serializes access to trampoline_table and image_tree */ +/* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); -static void *bpf_jit_alloc_exec_page(void) +void *bpf_jit_alloc_exec_page(void) { void *image; @@ -39,78 +38,20 @@ static void *bpf_jit_alloc_exec_page(void) return image; } -static __always_inline bool image_tree_less(struct latch_tree_node *a, - struct latch_tree_node *b) -{ - struct bpf_image *ia = container_of(a, struct bpf_image, tnode); - struct bpf_image *ib = container_of(b, struct bpf_image, tnode); - - return ia < ib; -} - -static __always_inline int image_tree_comp(void *addr, struct latch_tree_node *n) -{ - void *image = container_of(n, struct bpf_image, tnode); - - if (addr < image) - return -1; - if (addr >= image + PAGE_SIZE) - return 1; - - return 0; -} - -static const struct latch_tree_ops image_tree_ops = { - .less = image_tree_less, - .comp = image_tree_comp, -}; - -static void *__bpf_image_alloc(bool lock) -{ - struct bpf_image *image; - - image = bpf_jit_alloc_exec_page(); - if (!image) - return NULL; - - if (lock) - 
mutex_lock(&trampoline_mutex); - latch_tree_insert(&image->tnode, &image_tree, &image_tree_ops); - if (lock) - mutex_unlock(&trampoline_mutex); - return image->data; -} - -void *bpf_image_alloc(void) -{ - return __bpf_image_alloc(true); -} - -bool is_bpf_image_address(unsigned long addr) -{ - bool ret; - - rcu_read_lock(); - ret = latch_tree_find((void *) addr, &image_tree, &image_tree_ops) != NULL; - rcu_read_unlock(); - - return ret; -} - void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; - ksym->end = ksym->start + BPF_IMAGE_SIZE; + ksym->end = ksym->start + PAGE_SIZE; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, - BPF_IMAGE_SIZE, false, ksym->name); + PAGE_SIZE, false, ksym->name); } void bpf_image_ksym_del(struct bpf_ksym *ksym) { bpf_ksym_del(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, - BPF_IMAGE_SIZE, true, ksym->name); + PAGE_SIZE, true, ksym->name); } static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) @@ -141,7 +82,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key) goto out; /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = __bpf_image_alloc(false); + image = bpf_jit_alloc_exec_page(); if (!image) { kfree(tr); tr = NULL; @@ -243,8 +184,8 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * BPF_IMAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * BPF_IMAGE_SIZE/2; + void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; + void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; int err, total; @@ -272,7 +213,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) synchronize_rcu_tasks(); - err = arch_prepare_bpf_trampoline(new_image, new_image + BPF_IMAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) @@ -383,8 +324,6 @@ out: void bpf_trampoline_put(struct bpf_trampoline *tr) { - struct bpf_image *image; - if (!tr) return; mutex_lock(&trampoline_mutex); @@ -396,11 +335,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - image = container_of(tr->image, struct bpf_image, data); - latch_tree_erase(&image->tnode, &image_tree, &image_tree_ops); /* wait for tasks to get out of trampoline before freeing it */ synchronize_rcu_tasks(); - bpf_jit_free_exec(image); + bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: diff --git a/kernel/extable.c b/kernel/extable.c index a0024f27d3a1..7681f87e89dd 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -149,8 +149,6 @@ int kernel_text_address(unsigned long addr) goto out; if (is_bpf_text_address(addr)) goto out; - if (is_bpf_image_address(addr)) - goto out; ret = 0; out: if (no_rcu) -- cgit v1.2.3 From 58b05e58d155fd5a9a181d51b4c9c8a69a0816d3 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Thu, 12 Mar 2020 18:10:09 +0100 Subject: net: phy: Add XLGMII interface define Add a define for XLGMII interface. Signed-off-by: Jose Abreu Signed-off-by: David S. 
Miller --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7a08023bdbc5..6b872aed8ba6 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -94,6 +94,7 @@ typedef enum { PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, + PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, @@ -165,6 +166,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; + case PHY_INTERFACE_MODE_XLGMII: + return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_QSGMII: -- cgit v1.2.3 From 6daf14140129d30207ed6a0a69851fa6a3636bda Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 20 Feb 2020 07:59:14 -0600 Subject: netfilter: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] Lastly, fix checkpatch.pl warning WARNING: __aligned(size) is preferred over __attribute__((aligned(size))) in net/bridge/netfilter/ebtables.c This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 +- include/linux/netfilter/x_tables.h | 8 ++++---- include/linux/netfilter_arp/arp_tables.h | 2 +- include/linux/netfilter_bridge/ebtables.h | 2 +- include/linux/netfilter_ipv4/ip_tables.h | 2 +- include/linux/netfilter_ipv6/ip6_tables.h | 2 +- include/net/netfilter/nf_conntrack_extend.h | 2 +- include/net/netfilter/nf_conntrack_timeout.h | 2 +- include/net/netfilter/nf_tables.h | 6 +++--- include/uapi/linux/netfilter_bridge/ebt_among.h | 2 +- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_ip.c | 2 +- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 +- net/netfilter/ipset/ip_set_bitmap_port.c | 2 +- net/netfilter/ipset/ip_set_hash_gen.h | 4 ++-- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nft_set_pipapo.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/netfilter/xt_recent.c | 4 ++-- 22 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 5448c8b443db..ab192720e2d6 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -98,7 +98,7 @@ struct ip_set_counter { struct ip_set_comment_rcu { struct rcu_head rcu; - char str[0]; + char str[]; }; struct ip_set_comment { diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1b261c51b3a3..5da88451853b 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -264,7 +264,7 @@ struct xt_table_info { unsigned int stacksize; void ***jumpstack; - unsigned char entries[0] __aligned(8); + unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); @@ -464,7 +464,7 @@ struct compat_xt_entry_match { } kernel; u_int16_t match_size; } u; - unsigned char data[0]; + unsigned char data[]; }; struct compat_xt_entry_target { @@ -480,7 +480,7 @@ struct compat_xt_entry_target { } kernel; u_int16_t target_size; } u; - unsigned char data[0]; + unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks @@ -494,7 +494,7 @@ struct compat_xt_counters { struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; - struct compat_xt_counters counters[0]; + struct compat_xt_counters counters[]; }; struct _compat_xt_align { diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index e98028f00e47..7d3537c40ec9 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -67,7 +67,7 @@ struct compat_arpt_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; static inline struct xt_entry_target * diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 162f59d0d17a..2f5c4e6ecd8a 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -85,7 +85,7 @@ struct ebt_table_info { /* room to maintain the stack used for jumping from and into udc */ struct ebt_chainstack **chainstack; char *entries; - struct ebt_counter counters[0] ____cacheline_aligned; + struct ebt_counter counters[] ____cacheline_aligned; }; struct ebt_table { diff --git a/include/linux/netfilter_ipv4/ip_tables.h 
b/include/linux/netfilter_ipv4/ip_tables.h index e9e1ed74cdf1..b394bd4f68a3 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -76,7 +76,7 @@ struct compat_ipt_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; /* Helper functions */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 78ab959c4575..8225f7821a29 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -43,7 +43,7 @@ struct compat_ip6t_entry { __u16 next_offset; compat_uint_t comefrom; struct compat_xt_counters counters; - unsigned char elems[0]; + unsigned char elems[]; }; static inline struct xt_entry_target * diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 5ae5295aa46d..e1e588387103 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -45,7 +45,7 @@ enum nf_ct_ext_id { struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; - char data[0]; + char data[]; }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 6dd72396f534..659b0ea25b4d 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -14,7 +14,7 @@ struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; - char data[0]; + char data[]; }; struct ctnl_timeout { diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d913cdb6a27b..01f48b5e6bf2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -224,7 +224,7 @@ int nft_validate_register_store(const struct nft_ctx *ctx, */ struct nft_userdata { u8 len; - unsigned char data[0]; + unsigned char data[]; }; /** @@ -565,7 +565,7 @@ struct nft_set_ext_tmpl { struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; - char data[0]; + char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) @@ -1375,7 +1375,7 @@ struct nft_trans { int msg_type; bool put_net; struct nft_ctx ctx; - char data[0]; + char data[]; }; struct nft_trans_rule { diff --git a/include/uapi/linux/netfilter_bridge/ebt_among.h b/include/uapi/linux/netfilter_bridge/ebt_among.h index 9acf757bc1f7..73b26a280c4f 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_among.h +++ b/include/uapi/linux/netfilter_bridge/ebt_among.h @@ -40,7 +40,7 @@ struct ebt_mac_wormhash_tuple { struct ebt_mac_wormhash { int table[257]; int poolsize; - struct ebt_mac_wormhash_tuple pool[0]; + struct ebt_mac_wormhash_tuple pool[]; }; #define ebt_mac_wormhash_size(x) ((x) ? 
sizeof(struct ebt_mac_wormhash) \ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index e1256e03a9a8..78db58c7aec2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1561,7 +1561,7 @@ struct compat_ebt_entry_mwt { compat_uptr_t ptr; } u; compat_uint_t match_size; - compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace)))); + compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace)); }; /* account for possible padding between match_size and ->data */ diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f1f78a742b36..b167f4a5b684 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1057,7 +1057,7 @@ struct compat_arpt_replace { u32 underflow[NF_ARP_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; - struct compat_arpt_entry entries[0]; + struct compat_arpt_entry entries[]; }; static inline void compat_release_entry(struct compat_arpt_entry *e) @@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct compat_arpt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_arpt_entry entrytable[0]; + struct compat_arpt_entry entrytable[]; }; static int compat_get_entries(struct net *net, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 10b91ebdf213..c2670eaa74e6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1211,7 +1211,7 @@ struct compat_ipt_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ipt_entry entries[0]; + struct compat_ipt_entry entries[]; }; static int @@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ipt_entry entrytable[0]; + struct compat_ipt_entry entrytable[]; }; static int diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c973ace208c5..e27393498ecb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1227,7 +1227,7 @@ struct compat_ip6t_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ip6t_entry entries[0]; + struct compat_ip6t_entry entries[]; }; static int @@ -1571,7 +1571,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ip6t_entry entrytable[0]; + struct compat_ip6t_entry entrytable[]; }; static int diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 0a2196f59106..486959f70cf3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -46,7 +46,7 @@ struct bitmap_ip { u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 739e343efaf6..2310a316e0af 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -49,7 +49,7 @@ struct bitmap_ipmac { 
size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* MAC + data extensions */ + unsigned char extensions[] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index b49978dd810d..e56ced66f202 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -37,7 +37,7 @@ struct bitmap_port { size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e52d7b7597a0..1ee43752d6d3 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -76,7 +76,7 @@ struct hbucket { DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ - unsigned char value[0] /* the array of the values */ + unsigned char value[] /* the array of the values */ __aligned(__alignof__(u64)); }; @@ -109,7 +109,7 @@ struct htable { u8 htable_bits; /* size of hash table == 2^htable_bits */ u32 maxelem; /* Maxelem per region */ struct ip_set_region *hregion; /* Region locks and ext sizes */ - struct hbucket __rcu *bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[]; /* hashtable buckets */ }; #define hbucket(h, i) ((h)->bucket[i]) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 2481470dec36..5827117f2635 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -33,7 +33,7 @@ struct nf_acct { refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; - char data[0]; + char data[]; }; struct nfacct_filter { diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index d752a39b97d6..26395c8188b1 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -433,7 +433,7 @@ struct nft_pipapo_match { unsigned long * __percpu *scratch; size_t bsize_max; struct rcu_head rcu; - struct nft_pipapo_field f[0]; + struct nft_pipapo_field f[]; }; /* Current working bitmap index, toggled between field matches */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 8c835ad63729..9c5cfd74a0ee 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -132,7 +132,7 @@ struct xt_hashlimit_htable { const char *name; struct net *net; - struct hlist_head hash[0]; /* hashtable itself */ + struct hlist_head hash[]; /* hashtable itself */ }; static int diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 225a7ab6d79a..19bef176145e 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -71,7 +71,7 @@ struct recent_entry { u_int8_t ttl; u_int8_t index; u_int16_t nstamps; - unsigned long stamps[0]; + unsigned long stamps[]; }; struct recent_table { @@ -82,7 +82,7 @@ struct recent_table { unsigned int entries; u8 nstamps_max_mask; struct list_head lru_list; - struct list_head iphash[0]; + struct list_head iphash[]; }; struct recent_net { -- cgit v1.2.3 From f6554187855a4ddce235d22cc1486c118846b592 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 14 Mar 2020 10:09:53 +0000 Subject: net: 
mii: convert mii_lpa_to_ethtool_lpa_x() to linkmode variant Add an LPA-to-linkmode decoder for 1000BASE-X protocols; this decoder provides only the modify semantics, similar to other such decoders. This replaces the unused mii_lpa_to_ethtool_lpa_x() helper. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mii.h | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 18c6208f56fc..309de4a3e6e7 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -354,24 +354,6 @@ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) return result; } -/** - * mii_lpa_to_ethtool_lpa_x - * @adv: value of the MII_LPA register - * - * A small helper function that translates MII_LPA - * bits, when in 1000Base-X mode, to ethtool - * LP advertisement settings. - */ -static inline u32 mii_lpa_to_ethtool_lpa_x(u32 lpa) -{ - u32 result = 0; - - if (lpa & LPA_LPACK) - result |= ADVERTISED_Autoneg; - - return result | mii_adv_to_ethtool_adv_x(lpa); -} - /** * mii_lpa_mod_linkmode_adv_sgmii * @lp_advertising: pointer to destination link mode. @@ -535,6 +517,25 @@ static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) return lcl_adv; } +/** + * mii_lpa_mod_linkmode_x - decode the link partner's config_reg to linkmodes + * @linkmodes: link modes array + * @lpa: config_reg word from link partner + * @fd_bit: link mode for 1000XFULL bit + */ +static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, + int fd_bit) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, linkmodes, + lpa & LPA_LPACK); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes, + lpa & LPA_1000XPAUSE); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes, + lpa & LPA_1000XPAUSE_ASYM); + linkmode_mod_bit(fd_bit, linkmodes, + lpa & LPA_1000XFULL); +} + /** + * mii_advertise_flowctrl - get flow control advertisement flags + * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) -- cgit v1.2.3 From a9f28eba6eaab651e4cec6d936a107b03d61754a Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 14 Mar 2020 10:09:58 +0000 Subject: net: mii: add linkmode_adv_to_mii_adv_x() Add a helper to convert a linkmode advertisement to a clause 37 advertisement value for 1000base-x and 2500base-x. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- include/linux/mii.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 309de4a3e6e7..219b93cad1dd 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -536,6 +536,26 @@ static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, lpa & LPA_1000XFULL); } +/** + * linkmode_adv_to_mii_adv_x - encode a linkmode to config_reg + * @linkmodes: linkmodes + * @fd_bit: full duplex bit + */ +static inline u16 linkmode_adv_to_mii_adv_x(const unsigned long *linkmodes, + int fd_bit) +{ + u16 adv = 0; + + if (linkmode_test_bit(fd_bit, linkmodes)) + adv |= ADVERTISE_1000XFULL; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes)) + adv |= ADVERTISE_1000XPAUSE; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes)) + adv |= ADVERTISE_1000XPSE_ASYM; + + return adv; +} + /** * mii_advertise_flowctrl - get flow control advertisement flags * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) -- cgit v1.2.3 From b030f194aed290705426c62e501201c0739405c5 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Mar 2020 12:59:01 +0100 Subject: netfilter: Rename ingress hook include file Prepare for addition of a netfilter egress hook by renaming <linux/netfilter_ingress.h> to <linux/netfilter_netdev.h>. The egress hook also necessitates a refactoring of the include file, but that is done in a separate commit to ease reviewing. No functional change intended. Signed-off-by: Lukas Wunner Cc: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ingress.h | 58 --------------------------------------- include/linux/netfilter_netdev.h | 58 +++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 2 +- 3 files changed, 59 insertions(+), 59 deletions(-) delete mode 100644 include/linux/netfilter_ingress.h create mode 100644 include/linux/netfilter_netdev.h (limited to 'include/linux') diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h deleted file mode 100644 index a13774be2eb5..000000000000 --- a/include/linux/netfilter_ingress.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NETFILTER_INGRESS_H_ -#define _NETFILTER_INGRESS_H_ - -#include -#include - -#ifdef CONFIG_NETFILTER_INGRESS -static inline bool nf_hook_ingress_active(const struct sk_buff *skb) -{ -#ifdef CONFIG_JUMP_LABEL - if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) - return false; -#endif - return rcu_access_pointer(skb->dev->nf_hooks_ingress); -} - -/* caller must hold rcu_read_lock */ -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); - struct nf_hook_state state; - int ret; - - /* Must recheck the ingress hook head, in the event it became NULL - * after the check in nf_hook_ingress_active evaluated to true.
- */ - if (unlikely(!e)) - return 0; - - nf_hook_state_init(&state, NF_NETDEV_INGRESS, - NFPROTO_NETDEV, skb->dev, NULL, NULL, - dev_net(skb->dev), NULL); - ret = nf_hook_slow(skb, &state, e, 0); - if (ret == 0) - return -1; - - return ret; -} - -static inline void nf_hook_ingress_init(struct net_device *dev) -{ - RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); -} -#else /* CONFIG_NETFILTER_INGRESS */ -static inline int nf_hook_ingress_active(struct sk_buff *skb) -{ - return 0; -} - -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - return 0; -} - -static inline void nf_hook_ingress_init(struct net_device *dev) {} -#endif /* CONFIG_NETFILTER_INGRESS */ -#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h new file mode 100644 index 000000000000..a13774be2eb5 --- /dev/null +++ b/include/linux/netfilter_netdev.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline bool nf_hook_ingress_active(const struct sk_buff *skb) +{ +#ifdef CONFIG_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) + return false; +#endif + return rcu_access_pointer(skb->dev->nf_hooks_ingress); +} + +/* caller must hold rcu_read_lock */ +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_state state; + int ret; + + /* Must recheck the ingress hook head, in the event it became NULL + * after the check in nf_hook_ingress_active evaluated to true. + */ + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, NF_NETDEV_INGRESS, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); + ret = nf_hook_slow(skb, &state, e, 0); + if (ret == 0) + return -1; + + return ret; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index d84541c24446..b1ce1c942b54 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,7 +135,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 5418d3881e1f5d2cf9c1076eb8bd85770393a0e8 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Mar 2020 12:59:02 +0100 Subject: netfilter: Generalize ingress hook Prepare for addition of a netfilter egress hook by generalizing the ingress hook introduced by commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key"). In particular, rename and refactor the ingress hook's static inlines such that they can be reused for an egress hook. No functional change intended. 
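For orientation, here is a minimal sketch of the consumer side these inlines serve: a module attaching a hook at NF_NETDEV_INGRESS through the standard registration API, which is what ultimately populates dev->nf_hooks_ingress. The function and variable names here are invented for illustration; the registration API and structure fields are the real ones the patch operates on.

#include <linux/netfilter.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Accept everything; a real hook would inspect or mangle the skb here. */
static unsigned int sample_ingress_fn(void *priv, struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static struct nf_hook_ops sample_ops = {
	.hook     = sample_ingress_fn,
	.pf       = NFPROTO_NETDEV,
	.hooknum  = NF_NETDEV_INGRESS,
	.priority = 0,
};

static int sample_attach(struct net_device *dev)
{
	/* NFPROTO_NETDEV hooks are per-device: .dev must be set before
	 * registration (__nf_register_net_hook() returns -EINVAL without
	 * it), and registering populates dev->nf_hooks_ingress, which the
	 * nf_hook_netdev_active()/nf_hook_netdev() fast path consults.
	 */
	sample_ops.dev = dev;
	return nf_register_net_hook(dev_net(dev), &sample_ops);
}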
Signed-off-by: Lukas Wunner Cc: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_netdev.h | 45 +++++++++++++++++++++++++++------------- net/core/dev.c | 2 +- 2 files changed, 32 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index a13774be2eb5..49e26479642e 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -1,34 +1,37 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NETFILTER_INGRESS_H_ -#define _NETFILTER_INGRESS_H_ +#ifndef _NETFILTER_NETDEV_H_ +#define _NETFILTER_NETDEV_H_ #include #include -#ifdef CONFIG_NETFILTER_INGRESS -static inline bool nf_hook_ingress_active(const struct sk_buff *skb) +#ifdef CONFIG_NETFILTER +static __always_inline bool nf_hook_netdev_active(enum nf_dev_hooks hooknum, + struct nf_hook_entries __rcu *hooks) { #ifdef CONFIG_JUMP_LABEL - if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][hooknum])) return false; #endif - return rcu_access_pointer(skb->dev->nf_hooks_ingress); + return rcu_access_pointer(hooks); } /* caller must hold rcu_read_lock */ -static inline int nf_hook_ingress(struct sk_buff *skb) +static __always_inline int nf_hook_netdev(struct sk_buff *skb, + enum nf_dev_hooks hooknum, + struct nf_hook_entries __rcu *hooks) { - struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_entries *e = rcu_dereference(hooks); struct nf_hook_state state; int ret; - /* Must recheck the ingress hook head, in the event it became NULL - * after the check in nf_hook_ingress_active evaluated to true. + /* Must recheck the hook head, in the event it became NULL + * after the check in nf_hook_netdev_active evaluated to true. 
*/ if (unlikely(!e)) return 0; - nf_hook_state_init(&state, NF_NETDEV_INGRESS, + nf_hook_state_init(&state, hooknum, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); @@ -37,10 +40,26 @@ static inline int nf_hook_ingress(struct sk_buff *skb) return ret; } +#endif /* CONFIG_NETFILTER */ -static inline void nf_hook_ingress_init(struct net_device *dev) +static inline void nf_hook_netdev_init(struct net_device *dev) { +#ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); +#endif +} + +#ifdef CONFIG_NETFILTER_INGRESS +static inline bool nf_hook_ingress_active(const struct sk_buff *skb) +{ + return nf_hook_netdev_active(NF_NETDEV_INGRESS, + skb->dev->nf_hooks_ingress); +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return nf_hook_netdev(skb, NF_NETDEV_INGRESS, + skb->dev->nf_hooks_ingress); } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) @@ -52,7 +71,5 @@ static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } - -static inline void nf_hook_ingress_init(struct net_device *dev) {} #endif /* CONFIG_NETFILTER_INGRESS */ #endif /* _NETFILTER_INGRESS_H_ */ diff --git a/net/core/dev.c b/net/core/dev.c index b1ce1c942b54..13d562f67e9c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9846,7 +9846,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; - nf_hook_ingress_init(dev); + nf_hook_netdev_init(dev); return dev; -- cgit v1.2.3 From 8537f78647c072bdb1a5dbe32e1c7e5b13ff1258 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Mar 2020 12:59:03 +0100 Subject: netfilter: Introduce egress hook Commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key") introduced the ability to classify packets on ingress. Allow the same on egress. Position the hook immediately before a packet is handed to tc and then sent out on an interface, thereby mirroring the ingress order. This order allows marking packets in the netfilter egress hook and subsequently using the mark in tc. Another benefit of this order is consistency with a lot of existing documentation which says that egress tc is performed after netfilter hooks. Egress hooks already exist for the most common protocols, such as NF_INET_LOCAL_OUT or NF_ARP_OUT, and those are to be preferred because they are executed earlier during packet processing. However for more exotic protocols, there is currently no provision to apply netfilter on egress. A common workaround is to enslave the interface to a bridge and use ebtables, or to resort to tc. But when the ingress hook was introduced, consensus was that users should be given the choice to use netfilter or tc, whichever tool suits their needs best: https://lore.kernel.org/netdev/20150430153317.GA3230@salvia/ This hook is also useful for NAT46/NAT64, tunneling and filtering of locally generated af_packet traffic such as dhclient. 
There have also been occasional user requests for a netfilter egress hook in the past, e.g.: https://www.spinics.net/lists/netfilter/msg50038.html Performance measurements with pktgen surprisingly show a speedup rather than a slowdown with this commit: * Without this commit: Result: OK: 34240933(c34238375+d2558) usec, 100000000 (60byte,0frags) 2920481pps 1401Mb/sec (1401830880bps) errors: 0 * With this commit: Result: OK: 33997299(c33994193+d3106) usec, 100000000 (60byte,0frags) 2941410pps 1411Mb/sec (1411876800bps) errors: 0 * Without this commit + tc egress: Result: OK: 39022386(c39019547+d2839) usec, 100000000 (60byte,0frags) 2562631pps 1230Mb/sec (1230062880bps) errors: 0 * With this commit + tc egress: Result: OK: 37604447(c37601877+d2570) usec, 100000000 (60byte,0frags) 2659259pps 1276Mb/sec (1276444320bps) errors: 0 * With this commit + nft egress: Result: OK: 41436689(c41434088+d2600) usec, 100000000 (60byte,0frags) 2413320pps 1158Mb/sec (1158393600bps) errors: 0 Tested on a bare-metal Core i7-3615QM, each measurement was performed three times to verify that the numbers are stable. Commands to perform a measurement: modprobe pktgen echo "add_device lo@3" > /proc/net/pktgen/kpktgend_3 samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i 'lo@3' -n 100000000 Commands for testing tc egress: tc qdisc add dev lo clsact tc filter add dev lo egress protocol ip prio 1 u32 match ip dst 4.3.2.1/32 Commands for testing nft egress: nft add table netdev t nft add chain netdev t co \{ type filter hook egress device lo priority 0 \; \} nft add rule netdev t co ip daddr 4.3.2.1/32 drop All testing was performed on the loopback interface to avoid distorting measurements by the packet handling in the low-level Ethernet driver. Signed-off-by: Lukas Wunner Cc: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 4 ++++ include/linux/netfilter_netdev.h | 27 +++++++++++++++++++++++++++ include/uapi/linux/netfilter.h | 1 + net/core/dev.c | 23 ++++++++++++++++++++--- net/netfilter/Kconfig | 8 ++++++++ net/netfilter/core.c | 24 ++++++++++++++++++++---- net/netfilter/nft_chain_filter.c | 4 +++- 7 files changed, 83 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 654808bfad83..15f1e32b430c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1751,6 +1751,7 @@ enum netdev_priv_flags { * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing + * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) @@ -2026,6 +2027,9 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; #endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 49e26479642e..92d3611a782e 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -47,6 +47,9 @@ static inline void nf_hook_netdev_init(struct net_device *dev) #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif +#ifdef CONFIG_NETFILTER_EGRESS + RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); +#endif } #ifdef CONFIG_NETFILTER_INGRESS @@ -72,4 +75,28 @@ static inline int nf_hook_ingress(struct 
sk_buff *skb) return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ + +#ifdef CONFIG_NETFILTER_EGRESS +static inline bool nf_hook_egress_active(const struct sk_buff *skb) +{ + return nf_hook_netdev_active(NF_NETDEV_EGRESS, + skb->dev->nf_hooks_egress); +} + +static inline int nf_hook_egress(struct sk_buff *skb) +{ + return nf_hook_netdev(skb, NF_NETDEV_EGRESS, + skb->dev->nf_hooks_egress); +} +#else /* CONFIG_NETFILTER_EGRESS */ +static inline int nf_hook_egress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_egress(struct sk_buff *skb) +{ + return 0; +} +#endif /* CONFIG_NETFILTER_EGRESS */ #endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ca9e63d6e0e4..d1616574c54f 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -50,6 +50,7 @@ enum nf_inet_hooks { enum nf_dev_hooks { NF_NETDEV_INGRESS, + NF_NETDEV_EGRESS, NF_NETDEV_NUMHOOKS }; diff --git a/net/core/dev.c b/net/core/dev.c index 13d562f67e9c..a2da72a77c20 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3773,6 +3773,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { +#ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; @@ -3806,11 +3807,24 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) default: break; } - +#endif /* CONFIG_NET_CLS_ACT */ return skb; } #endif /* CONFIG_NET_EGRESS */ +static inline int nf_egress(struct sk_buff *skb) +{ + if (nf_hook_egress_active(skb)) { + int ret; + + rcu_read_lock(); + ret = nf_hook_egress(skb); + rcu_read_unlock(); + return ret; + } + return 0; +} + #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) @@ -3997,13 +4011,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; -# ifdef CONFIG_NET_EGRESS +#endif +#ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { + if (nf_egress(skb) < 0) + goto out; + skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; } -# endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 468fea1aebba..f4c68f60f241 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -10,6 +10,14 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. +config NETFILTER_EGRESS + bool "Netfilter egress support" + default y + select NET_EGRESS + help + This allows you to classify packets before transmission using the + Netfilter infrastructure. 
+ config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78f046ec506f..85e9c959aba7 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -306,6 +306,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hooknum == NF_NETDEV_EGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_egress; + } #endif WARN_ON_ONCE(1); return NULL; @@ -318,11 +324,13 @@ static int __nf_register_net_hook(struct net *net, int pf, struct nf_hook_entries __rcu **pp; if (pf == NFPROTO_NETDEV) { -#ifndef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) + if ((!IS_ENABLED(CONFIG_NETFILTER_INGRESS) && + reg->hooknum == NF_NETDEV_INGRESS) || + (!IS_ENABLED(CONFIG_NETFILTER_EGRESS) && + reg->hooknum == NF_NETDEV_EGRESS)) return -EOPNOTSUPP; -#endif - if (reg->hooknum != NF_NETDEV_INGRESS || + if ((reg->hooknum != NF_NETDEV_INGRESS && + reg->hooknum != NF_NETDEV_EGRESS) || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; } @@ -348,6 +356,10 @@ static int __nf_register_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS) + net_inc_egress_queue(); +#endif #ifdef CONFIG_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); #endif @@ -406,6 +418,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS) + net_dec_egress_queue(); +#endif #ifdef CONFIG_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); #endif diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c78d01bc02e9..67ce6dbb5496 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -277,9 +277,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; -- cgit v1.2.3 From 613f53fe09a27f928a7d05132e1a74b5136e8f04 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 24 Feb 2020 16:59:54 +0200 Subject: net/mlx5: Eswitch, enable forwarding back to uplink port Add a dependency on the termination_table_raw_traffic capability to allow non-encapsulated packets received from an uplink to be forwarded back to the uplink port they were received on. Refactor the conditions into a separate function.
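For illustration, a minimal sketch of the capability-gating pattern this commit adopts: check a firmware capability bit before accepting an offload, and report the reason through extack on failure. The helper name is hypothetical; the capability macro and extack reporting are the same ones used in the diff below.

#include <linux/netlink.h>
#include <linux/mlx5/device.h>
#include <linux/mlx5/driver.h>

/* Hypothetical helper: allow uplink-to-uplink forwarding only when the
 * firmware reports the termination_table_raw_traffic bit, and surface
 * the failure reason to userspace via extack.
 */
static int check_uplink_fwd_cap(struct mlx5_core_dev *mdev,
				struct netlink_ext_ack *extack)
{
	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, termination_table_raw_traffic)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "uplink-to-uplink forwarding not supported by firmware");
		return -EOPNOTSUPP;
	}

	return 0;
}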
Signed-off-by: Eli Cohen Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 59 ++++++++++++++++++------- include/linux/mlx5/mlx5_ifc.h | 3 +- 2 files changed, 45 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ddb933aa8d59..ebf60ff30295 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3641,6 +3641,46 @@ static int mlx5_validate_goto_chain(struct mlx5_eswitch *esw, return 0; } +static int verify_uplink_forwarding(struct mlx5e_priv *priv, + struct mlx5e_tc_flow *flow, + struct net_device *out_dev, + struct netlink_ext_ack *extack) +{ + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5_esw_flow_attr *attr = flow->esw_attr; + struct mlx5e_rep_priv *rep_priv; + + /* Forwarding non encapsulated traffic between + * uplink ports is allowed only if + * termination_table_raw_traffic cap is set. + * + * Input vport was stored esw_attr->in_rep. + * In LAG case, *priv* is the private data of + * uplink which may be not the input vport. + */ + rep_priv = mlx5e_rep_to_rep_priv(attr->in_rep); + + if (!(mlx5e_eswitch_uplink_rep(rep_priv->netdev) && + mlx5e_eswitch_uplink_rep(out_dev))) + return 0; + + if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, + termination_table_raw_traffic)) { + NL_SET_ERR_MSG_MOD(extack, + "devices are both uplink, can't offload forwarding"); + pr_err("devices %s %s are both uplink, can't offload forwarding\n", + priv->netdev->name, out_dev->name); + return -EOPNOTSUPP; + } else if (out_dev != rep_priv->netdev) { + NL_SET_ERR_MSG_MOD(extack, + "devices are not the same uplink, can't offload forwarding"); + pr_err("devices %s %s are both uplink but not the same, can't offload forwarding\n", + priv->netdev->name, out_dev->name); + return -EOPNOTSUPP; + } + return 0; +} + static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct flow_action *flow_action, struct mlx5e_tc_flow *flow, @@ -3738,7 +3778,6 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct net_device *uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH); struct net_device *uplink_upper; - struct mlx5e_rep_priv *rep_priv; if (is_duplicated_output_device(priv->netdev, out_dev, @@ -3774,21 +3813,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, return err; } - /* Don't allow forwarding between uplink. - * - * Input vport was stored esw_attr->in_rep. - * In LAG case, *priv* is the private data of - * uplink which may be not the input vport. 
- */ - rep_priv = mlx5e_rep_to_rep_priv(attr->in_rep); - if (mlx5e_eswitch_uplink_rep(rep_priv->netdev) && - mlx5e_eswitch_uplink_rep(out_dev)) { - NL_SET_ERR_MSG_MOD(extack, - "devices are both uplink, can't offload forwarding"); - pr_err("devices %s %s are both uplink, can't offload forwarding\n", - priv->netdev->name, out_dev->name); - return -EOPNOTSUPP; - } + err = verify_uplink_forwarding(priv, flow, out_dev, extack); + if (err) + return err; if (!mlx5e_is_valid_eswitch_fwd_dev(priv, out_dev)) { NL_SET_ERR_MSG_MOD(extack, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2bd920965bd3..cc55cee3b53c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -416,7 +416,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 termination_table[0x1]; u8 reformat_and_fwd_to_table[0x1]; u8 reserved_at_1a[0x6]; - u8 reserved_at_20[0x2]; + u8 termination_table_raw_traffic[0x1]; + u8 reserved_at_21[0x1]; u8 log_max_ft_size[0x6]; u8 log_max_modify_header_context[0x8]; u8 max_modify_header_actions[0x8]; -- cgit v1.2.3 From 9000edb71ab29d184aa33f5a77fa6e52d8812bb9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Mar 2020 13:47:12 -0700 Subject: net: ethtool: require drivers to set supported_coalesce_params Now that all in-tree drivers have been updated we can make the supported_coalesce_params mandatory. To save debugging time in case some driver was missed (or is out of tree) add a warning when netdev is registered with set_coalesce but without supported_coalesce_params. Signed-off-by: Jakub Kicinski Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 ++ net/core/dev.c | 4 ++++ net/ethtool/common.c | 11 +++++++++++ net/ethtool/ioctl.c | 3 --- 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index be355f37337d..c1d379bf6ee1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -458,6 +458,8 @@ struct ethtool_ops { struct ethtool_stats *, u64 *); }; +int ethtool_check_ops(const struct ethtool_ops *ops); + struct ethtool_rx_flow_rule { struct flow_rule *rule; unsigned long priv[0]; diff --git a/net/core/dev.c b/net/core/dev.c index d84541c24446..021e18251465 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9283,6 +9283,10 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + spin_lock_init(&dev->addr_list_lock); lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0b22741b2f8f..dab047eec943 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -289,3 +289,14 @@ out: kfree(indir); return ret; } + +int ethtool_check_ops(const struct ethtool_ops *ops) +{ + if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) + return -EINVAL; + /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime, + * the fact that ops are checked at registration time does not + * mean the ops attached to a netdev later on are sane. 
+ */ + return 0; +} diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 258840b19fb5..3852a58d7f95 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1519,9 +1519,6 @@ ethtool_set_coalesce_supported(struct net_device *dev, u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; - if (!supported_params) - return true; - if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) -- cgit v1.2.3 From 9010f9deb000edce823cb79345f137742ccffa19 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Mar 2020 22:32:33 +0100 Subject: net: phy: improve phy_driver callback handle_interrupt did_interrupt() clears the interrupt, therefore handle_interrupt() cannot check which event triggered the interrupt. To overcome this constraint and allow more flexibility for custom interrupt handlers, let's decouple handle_interrupt() from parts of the phylib interrupt handling. Custom interrupt handlers now have to implement the did_interrupt() functionality in handle_interrupt() if needed. Fortunately we have just one custom interrupt handler so far (in the mscc PHY driver); convert it to the changed API. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc_main.c | 11 +++++++++-- drivers/net/phy/phy.c | 26 ++++++++++++-------------- include/linux/phy.h | 3 ++- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index cb4d65f81095..4727aba8e051 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -1429,11 +1429,18 @@ err: return ret; } -static int vsc8584_handle_interrupt(struct phy_device *phydev) +static irqreturn_t vsc8584_handle_interrupt(struct phy_device *phydev) { + int irq_status; + + irq_status = phy_read(phydev, MII_VSC85XX_INT_STATUS); + if (irq_status < 0 || !(irq_status & MII_VSC85XX_INT_MASK_MASK)) + return IRQ_NONE; + vsc8584_handle_macsec_interrupt(phydev); phy_mac_interrupt(phydev); - return 0; + + return IRQ_HANDLED; } static int vsc85xx_config_init(struct phy_device *phydev) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 355bfdef48d2..d71212a418f3 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -715,26 +715,24 @@ static int phy_disable_interrupts(struct phy_device *phydev) static irqreturn_t phy_interrupt(int irq, void *phy_dat) { struct phy_device *phydev = phy_dat; + struct phy_driver *drv = phydev->drv; - if (phydev->drv->did_interrupt && !phydev->drv->did_interrupt(phydev)) + if (drv->handle_interrupt) + return drv->handle_interrupt(phydev); + + if (drv->did_interrupt && !drv->did_interrupt(phydev)) return IRQ_NONE; - if (phydev->drv->handle_interrupt) { - if (phydev->drv->handle_interrupt(phydev)) - goto phy_err; - } else { - /* reschedule state queue work to run as soon as possible */ - phy_trigger_machine(phydev); - } + /* reschedule state queue work to run as soon as possible */ + phy_trigger_machine(phydev); /* did_interrupt() may have cleared the interrupt already */ - if (!phydev->drv->did_interrupt && phy_clear_interrupt(phydev)) - goto phy_err; - return IRQ_HANDLED; + if (!drv->did_interrupt && phy_clear_interrupt(phydev)) { + phy_error(phydev); + return IRQ_NONE; + } -phy_err: - phy_error(phydev); - return IRQ_NONE; + return IRQ_HANDLED; } /** diff --git a/include/linux/phy.h b/include/linux/phy.h index 6b872aed8ba6..cb5a2182ba6d 100644 ---
a/include/linux/phy.h +++ b/include/linux/phy.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -568,7 +569,7 @@ struct phy_driver { int (*did_interrupt)(struct phy_device *phydev); /* Override default interrupt handling */ - int (*handle_interrupt)(struct phy_device *phydev); + irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /* Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); -- cgit v1.2.3 From 6cc7cf8125b3d086cd80c96e02edb6f4ab9b20fa Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 17 Mar 2020 14:52:31 +0000 Subject: net: mdiobus: add APIs for modifying a MDIO device register Add APIs for modifying a MDIO device register, similar to the existing phy_modify() group of functions, but at mdiobus level instead. Adapt __phy_modify_changed() to use the new mdiobus level helper. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy-core.c | 31 ------------------------- include/linux/mdio.h | 4 ++++ include/linux/phy.h | 19 ++++++++++++++++ 4 files changed, 79 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 129e60630319..522760c8bca6 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -824,6 +824,38 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) } EXPORT_SYMBOL(__mdiobus_write); +/** + * __mdiobus_modify_changed - Unlocked version of the mdiobus_modify function + * @bus: the mii_bus struct + * @addr: the phy address + * @regnum: register number to modify + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + * + * Read, modify, and if any change, write the register value back to the + * device. Any error returns a negative number. + * + * NOTE: MUST NOT be called from interrupt context. + */ +int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, + u16 mask, u16 set) +{ + int new, ret; + + ret = __mdiobus_read(bus, addr, regnum); + if (ret < 0) + return ret; + + new = (ret & ~mask) | set; + if (new == ret) + return 0; + + ret = __mdiobus_write(bus, addr, regnum, new); + + return ret < 0 ? ret : 1; +} +EXPORT_SYMBOL_GPL(__mdiobus_modify_changed); + /** * mdiobus_read_nested - Nested version of the mdiobus_read function * @bus: the mii_bus struct @@ -932,6 +964,30 @@ int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) } EXPORT_SYMBOL(mdiobus_write); +/** + * mdiobus_modify - Convenience function for modifying a given mdio device + * register + * @bus: the mii_bus struct + * @addr: the phy address + * @regnum: register number to write + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + */ +int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, u16 set) +{ + int err; + + if (WARN_ON_ONCE(in_interrupt())) + return -EINVAL; + + mutex_lock(&bus->mdio_lock); + err = __mdiobus_modify_changed(bus, addr, regnum, mask, set); + mutex_unlock(&bus->mdio_lock); + + return err < 0 ? 
err : 0; +} +EXPORT_SYMBOL_GPL(mdiobus_modify); + /** * mdio_bus_match - determine if given MDIO driver supports the given * MDIO device diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index e083e7a76ada..94cd85b1e49b 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -488,37 +488,6 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) } EXPORT_SYMBOL(phy_write_mmd); -/** - * __phy_modify_changed() - Convenience function for modifying a PHY register - * @phydev: a pointer to a &struct phy_device - * @regnum: register number - * @mask: bit mask of bits to clear - * @set: bit mask of bits to set - * - * Unlocked helper function which allows a PHY register to be modified as - * new register value = (old register value & ~mask) | set - * - * Returns negative errno, 0 if there was no change, and 1 in case of change - */ -int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, - u16 set) -{ - int new, ret; - - ret = __phy_read(phydev, regnum); - if (ret < 0) - return ret; - - new = (ret & ~mask) | set; - if (new == ret) - return 0; - - ret = __phy_write(phydev, regnum, new); - - return ret < 0 ? ret : 1; -} -EXPORT_SYMBOL_GPL(__phy_modify_changed); - /** * phy_modify_changed - Function for modifying a PHY register * @phydev: the phy_device struct diff --git a/include/linux/mdio.h b/include/linux/mdio.h index a7604248777b..917e4bb2ed71 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -316,11 +316,15 @@ static inline void mii_10gbt_stat_mod_linkmode_lpa_t(unsigned long *advertising, int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, + u16 mask, u16 set); int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum); int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val); +int mdiobus_modify(struct mii_bus *bus, int addr, u32 regnum, u16 mask, + u16 set); int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); diff --git a/include/linux/phy.h b/include/linux/phy.h index cb5a2182ba6d..36d9dea04016 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -754,6 +754,25 @@ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) val); } +/** + * __phy_modify_changed() - Convenience function for modifying a PHY register + * @phydev: a pointer to a &struct phy_device + * @regnum: register number + * @mask: bit mask of bits to clear + * @set: bit mask of bits to set + * + * Unlocked helper function which allows a PHY register to be modified as + * new register value = (old register value & ~mask) | set + * + * Returns negative errno, 0 if there was no change, and 1 in case of change + */ +static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, + u16 mask, u16 set) +{ + return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, + regnum, mask, set); +} + /** * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. 
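As a usage sketch (not part of the patch; the function name is hypothetical), the new bus-level helper lets MAC or PCS code do a locked read-modify-write on a raw MDIO device without needing a phy_device:

#include <linux/phy.h>
#include <linux/mdio.h>
#include <linux/mii.h>

/* Hypothetical example: clear BMCR_PDOWN on a device at @addr on @bus,
 * leaving all other BMCR bits untouched. mdiobus_modify() takes the bus
 * lock itself and returns 0 on success whether or not the register
 * value actually changed; it must not be called from interrupt context.
 */
static int sample_pcs_power_up(struct mii_bus *bus, int addr)
{
	return mdiobus_modify(bus, addr, MII_BMCR, BMCR_PDOWN, 0);
}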
-- cgit v1.2.3 From 74db1c18d80aaacdc42e6d00d7aca2bb5c97c7ea Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 17 Mar 2020 14:52:36 +0000 Subject: net: phylink: pcs: add 802.3 clause 22 helpers Implement helpers for PCS accessed via the MII bus using 802.3 clause 22 cycles, conforming to 802.3 clause 37 and Cisco SGMII specifications for the advertisement word. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/phylink.h | 6 ++ include/uapi/linux/mii.h | 5 ++ 3 files changed, 217 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 60f32b354013..ced99e8fda31 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2068,4 +2068,210 @@ void phylink_helper_basex_speed(struct phylink_link_state *state) } EXPORT_SYMBOL_GPL(phylink_helper_basex_speed); +static void phylink_decode_c37_word(struct phylink_link_state *state, + uint16_t config_reg, int speed) +{ + bool tx_pause, rx_pause; + int fd_bit; + + if (speed == SPEED_2500) + fd_bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT; + else + fd_bit = ETHTOOL_LINK_MODE_1000baseX_Full_BIT; + + mii_lpa_mod_linkmode_x(state->lp_advertising, config_reg, fd_bit); + + if (linkmode_test_bit(fd_bit, state->advertising) && + linkmode_test_bit(fd_bit, state->lp_advertising)) { + state->speed = speed; + state->duplex = DUPLEX_FULL; + } else { + /* negotiation failure */ + state->link = false; + } + + linkmode_resolve_pause(state->advertising, state->lp_advertising, + &tx_pause, &rx_pause); + + if (tx_pause) + state->pause |= MLO_PAUSE_TX; + if (rx_pause) + state->pause |= MLO_PAUSE_RX; +} + +static void phylink_decode_sgmii_word(struct phylink_link_state *state, + uint16_t config_reg) +{ + if (!(config_reg & LPA_SGMII_LINK)) { + state->link = false; + return; + } + + switch (config_reg & LPA_SGMII_SPD_MASK) { + case LPA_SGMII_10: + state->speed = SPEED_10; + break; + case LPA_SGMII_100: + state->speed = SPEED_100; + break; + case LPA_SGMII_1000: + state->speed = SPEED_1000; + break; + default: + state->link = false; + return; + } + if (config_reg & LPA_SGMII_FULL_DUPLEX) + state->duplex = DUPLEX_FULL; + else + state->duplex = DUPLEX_HALF; +} + +/** + * phylink_mii_c22_pcs_get_state() - read the MAC PCS state + * @pcs: a pointer to a &struct mdio_device. + * @state: a pointer to a &struct phylink_link_state. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation and/or SGMII control. + * + * Read the MAC PCS state from the MII device configured in @config and + * parse the Clause 37 or Cisco SGMII link partner negotiation word into + * the phylink @state structure. This is suitable to be directly plugged + * into the mac_pcs_get_state() member of the struct phylink_mac_ops + * structure. 
+ */ +void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int bmsr, lpa; + + bmsr = mdiobus_read(bus, addr, MII_BMSR); + lpa = mdiobus_read(bus, addr, MII_LPA); + if (bmsr < 0 || lpa < 0) { + state->link = false; + return; + } + + state->link = !!(bmsr & BMSR_LSTATUS); + state->an_complete = !!(bmsr & BMSR_ANEGCOMPLETE); + if (!state->link) + return; + + switch (state->interface) { + case PHY_INTERFACE_MODE_1000BASEX: + phylink_decode_c37_word(state, lpa, SPEED_1000); + break; + + case PHY_INTERFACE_MODE_2500BASEX: + phylink_decode_c37_word(state, lpa, SPEED_2500); + break; + + case PHY_INTERFACE_MODE_SGMII: + phylink_decode_sgmii_word(state, lpa); + break; + + default: + state->link = false; + break; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_get_state); + +/** + * phylink_mii_c22_pcs_set_advertisement() - configure the clause 37 PCS + * advertisement + * @pcs: a pointer to a &struct mdio_device. + * @state: a pointer to the state being configured. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation and/or SGMII control. + * + * Configure the clause 37 PCS advertisement as specified by @state. This + * does not trigger a renegotiation; phylink will do that via the + * mac_an_restart() method of the struct phylink_mac_ops structure. + * + * Returns negative error code on failure to configure the advertisement, + * zero if no change has been made, or one if the advertisement has changed. + */ +int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, + const struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int val, ret; + u16 adv; + + switch (state->interface) { + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + adv = ADVERTISE_1000XFULL; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + state->advertising)) + adv |= ADVERTISE_1000XPAUSE; + if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + state->advertising)) + adv |= ADVERTISE_1000XPSE_ASYM; + + val = mdiobus_read(bus, addr, MII_ADVERTISE); + if (val < 0) + return val; + + if (val == adv) + return 0; + + ret = mdiobus_write(bus, addr, MII_ADVERTISE, adv); + if (ret < 0) + return ret; + + return 1; + + case PHY_INTERFACE_MODE_SGMII: + val = mdiobus_read(bus, addr, MII_ADVERTISE); + if (val < 0) + return val; + + if (val == 0x0001) + return 0; + + ret = mdiobus_write(bus, addr, MII_ADVERTISE, 0x0001); + if (ret < 0) + return ret; + + return 1; + + default: + /* Nothing to do for other modes */ + return 0; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_set_advertisement); + +/** + * phylink_mii_c22_pcs_an_restart() - restart 802.3z autonegotiation + * @pcs: a pointer to a &struct mdio_device. + * + * Helper for MAC PCS supporting the 802.3 clause 22 register set for + * clause 37 negotiation. + * + * Restart the clause 37 negotiation with the link partner. This is + * suitable to be directly plugged into the mac_pcs_get_state() member + * of the struct phylink_mac_ops structure. 
+ */ +void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs) +{ + struct mii_bus *bus = pcs->bus; + int val, addr = pcs->addr; + + val = mdiobus_read(bus, addr, MII_BMCR); + if (val >= 0) { + val |= BMCR_ANRESTART; + + mdiobus_write(bus, addr, MII_BMCR, val); + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_an_restart); + MODULE_LICENSE("GPL v2"); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 2180eb1aa254..de591c2fb37e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -317,4 +317,10 @@ int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); void phylink_set_port_modes(unsigned long *bits); void phylink_helper_basex_speed(struct phylink_link_state *state); +void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state); +int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, + const struct phylink_link_state *state); +void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); + #endif diff --git a/include/uapi/linux/mii.h b/include/uapi/linux/mii.h index 0b9c3beda345..90f9b4e1ba27 100644 --- a/include/uapi/linux/mii.h +++ b/include/uapi/linux/mii.h @@ -134,11 +134,16 @@ /* MAC and PHY tx_config_Reg[15:0] for SGMII in-band auto-negotiation.*/ #define ADVERTISE_SGMII 0x0001 /* MAC can do SGMII */ #define LPA_SGMII 0x0001 /* PHY can do SGMII */ +#define LPA_SGMII_SPD_MASK 0x0c00 /* SGMII speed mask */ +#define LPA_SGMII_FULL_DUPLEX 0x1000 /* SGMII full duplex */ #define LPA_SGMII_DPX_SPD_MASK 0x1C00 /* SGMII duplex and speed bits */ +#define LPA_SGMII_10 0x0000 /* 10Mbps */ #define LPA_SGMII_10HALF 0x0000 /* Can do 10mbps half-duplex */ #define LPA_SGMII_10FULL 0x1000 /* Can do 10mbps full-duplex */ +#define LPA_SGMII_100 0x0400 /* 100Mbps */ #define LPA_SGMII_100HALF 0x0400 /* Can do 100mbps half-duplex */ #define LPA_SGMII_100FULL 0x1400 /* Can do 100mbps full-duplex */ +#define LPA_SGMII_1000 0x0800 /* 1000Mbps */ #define LPA_SGMII_1000HALF 0x0800 /* Can do 1000mbps half-duplex */ #define LPA_SGMII_1000FULL 0x1800 /* Can do 1000mbps full-duplex */ #define LPA_SGMII_LINK 0x8000 /* PHY link with copper-side partner */ -- cgit v1.2.3 From b8679ef8bedfe2bae90c97bc4c8a1826cfd98bba Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 17 Mar 2020 14:52:41 +0000 Subject: net: phylink: pcs: add 802.3 clause 45 helpers Implement helpers for PCS accessed via the MII bus using 802.3 clause 45 cycles for 10GBASE-R. Only link up/down is supported, 10G full duplex is assumed. Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/phylink.c | 30 ++++++++++++++++++++++++++++++ include/linux/phylink.h | 2 ++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index ced99e8fda31..fed0c5907c6a 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2274,4 +2274,34 @@ void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs) } EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_an_restart); +#define C45_ADDR(d,a) (MII_ADDR_C45 | (d) << 16 | (a)) +void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state) +{ + struct mii_bus *bus = pcs->bus; + int addr = pcs->addr; + int stat; + + stat = mdiobus_read(bus, addr, C45_ADDR(MDIO_MMD_PCS, MDIO_STAT1)); + if (stat < 0) { + state->link = false; + return; + } + + state->link = !!(stat & MDIO_STAT1_LSTATUS); + if (!state->link) + return; + + switch (state->interface) { + case PHY_INTERFACE_MODE_10GBASER: + state->speed = SPEED_10000; + state->duplex = DUPLEX_FULL; + break; + + default: + break; + } +} +EXPORT_SYMBOL_GPL(phylink_mii_c45_pcs_get_state); + MODULE_LICENSE("GPL v2"); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index de591c2fb37e..8fa6df3b881b 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -323,4 +323,6 @@ int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, const struct phylink_link_state *state); void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); +void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, + struct phylink_link_state *state); #endif -- cgit v1.2.3 From 357b6cc5834eabc1be7c28a9faae7da061df097d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 Mar 2020 10:33:22 +0100 Subject: netfilter: revert introduction of egress hook This reverts the following commits: 8537f78647c0 ("netfilter: Introduce egress hook") 5418d3881e1f ("netfilter: Generalize ingress hook") b030f194aed2 ("netfilter: Rename ingress hook include file") From the discussion in [0], the author's main motivation to add a hook in the fast path is for an out-of-tree kernel module, which is a red flag to begin with. Other mentioned potential use cases like NAT{64,46} are future extensions w/o concrete code in the tree yet. Revert as suggested [1] given the weak justification to add more hooks to the critical fast path. [0] https://lore.kernel.org/netdev/cover.1583927267.git.lukas@wunner.de/ [1] https://lore.kernel.org/netdev/20200318.011152.72770718915606186.davem@davemloft.net/ Signed-off-by: Daniel Borkmann Cc: David Miller Cc: Pablo Neira Ayuso Cc: Alexei Starovoitov Nacked-by: Pablo Neira Ayuso Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 4 -- include/linux/netfilter_ingress.h | 58 ++++++++++++++++++++++ include/linux/netfilter_netdev.h | 102 -------------------------------------- include/uapi/linux/netfilter.h | 1 - net/core/dev.c | 27 ++-------- net/netfilter/Kconfig | 8 --- net/netfilter/core.c | 24 ++------- net/netfilter/nft_chain_filter.c | 4 +- 8 files changed, 68 insertions(+), 160 deletions(-) create mode 100644 include/linux/netfilter_ingress.h delete mode 100644 include/linux/netfilter_netdev.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 15f1e32b430c..654808bfad83 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1751,7 +1751,6 @@ enum netdev_priv_flags { * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing - * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) @@ -2027,9 +2026,6 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; #endif -#ifdef CONFIG_NETFILTER_EGRESS - struct nf_hook_entries __rcu *nf_hooks_egress; -#endif #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h new file mode 100644 index 000000000000..a13774be2eb5 --- /dev/null +++ b/include/linux/netfilter_ingress.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline bool nf_hook_ingress_active(const struct sk_buff *skb) +{ +#ifdef CONFIG_JUMP_LABEL + if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) + return false; +#endif + return rcu_access_pointer(skb->dev->nf_hooks_ingress); +} + +/* caller must hold rcu_read_lock */ +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_state state; + int ret; + + /* Must recheck the ingress hook head, in the event it became NULL + * after the check in nf_hook_ingress_active evaluated to true. 
+ */ + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, NF_NETDEV_INGRESS, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); + ret = nf_hook_slow(skb, &state, e, 0); + if (ret == 0) + return -1; + + return ret; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h deleted file mode 100644 index 92d3611a782e..000000000000 --- a/include/linux/netfilter_netdev.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NETFILTER_NETDEV_H_ -#define _NETFILTER_NETDEV_H_ - -#include -#include - -#ifdef CONFIG_NETFILTER -static __always_inline bool nf_hook_netdev_active(enum nf_dev_hooks hooknum, - struct nf_hook_entries __rcu *hooks) -{ -#ifdef CONFIG_JUMP_LABEL - if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][hooknum])) - return false; -#endif - return rcu_access_pointer(hooks); -} - -/* caller must hold rcu_read_lock */ -static __always_inline int nf_hook_netdev(struct sk_buff *skb, - enum nf_dev_hooks hooknum, - struct nf_hook_entries __rcu *hooks) -{ - struct nf_hook_entries *e = rcu_dereference(hooks); - struct nf_hook_state state; - int ret; - - /* Must recheck the hook head, in the event it became NULL - * after the check in nf_hook_netdev_active evaluated to true. - */ - if (unlikely(!e)) - return 0; - - nf_hook_state_init(&state, hooknum, - NFPROTO_NETDEV, skb->dev, NULL, NULL, - dev_net(skb->dev), NULL); - ret = nf_hook_slow(skb, &state, e, 0); - if (ret == 0) - return -1; - - return ret; -} -#endif /* CONFIG_NETFILTER */ - -static inline void nf_hook_netdev_init(struct net_device *dev) -{ -#ifdef CONFIG_NETFILTER_INGRESS - RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); -#endif -#ifdef CONFIG_NETFILTER_EGRESS - RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); -#endif -} - -#ifdef CONFIG_NETFILTER_INGRESS -static inline bool nf_hook_ingress_active(const struct sk_buff *skb) -{ - return nf_hook_netdev_active(NF_NETDEV_INGRESS, - skb->dev->nf_hooks_ingress); -} - -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - return nf_hook_netdev(skb, NF_NETDEV_INGRESS, - skb->dev->nf_hooks_ingress); -} -#else /* CONFIG_NETFILTER_INGRESS */ -static inline int nf_hook_ingress_active(struct sk_buff *skb) -{ - return 0; -} - -static inline int nf_hook_ingress(struct sk_buff *skb) -{ - return 0; -} -#endif /* CONFIG_NETFILTER_INGRESS */ - -#ifdef CONFIG_NETFILTER_EGRESS -static inline bool nf_hook_egress_active(const struct sk_buff *skb) -{ - return nf_hook_netdev_active(NF_NETDEV_EGRESS, - skb->dev->nf_hooks_egress); -} - -static inline int nf_hook_egress(struct sk_buff *skb) -{ - return nf_hook_netdev(skb, NF_NETDEV_EGRESS, - skb->dev->nf_hooks_egress); -} -#else /* CONFIG_NETFILTER_EGRESS */ -static inline int nf_hook_egress_active(struct sk_buff *skb) -{ - return 0; -} - -static inline int nf_hook_egress(struct sk_buff *skb) -{ - return 0; -} -#endif /* CONFIG_NETFILTER_EGRESS */ -#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index d1616574c54f..ca9e63d6e0e4 100644 --- 
a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -50,7 +50,6 @@ enum nf_inet_hooks { enum nf_dev_hooks { NF_NETDEV_INGRESS, - NF_NETDEV_EGRESS, NF_NETDEV_NUMHOOKS }; diff --git a/net/core/dev.c b/net/core/dev.c index aeb8ccbbe93b..021e18251465 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,7 +135,7 @@ #include #include #include -#include +#include #include #include #include @@ -3773,7 +3773,6 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { -#ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; @@ -3807,24 +3806,11 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) default: break; } -#endif /* CONFIG_NET_CLS_ACT */ + return skb; } #endif /* CONFIG_NET_EGRESS */ -static inline int nf_egress(struct sk_buff *skb) -{ - if (nf_hook_egress_active(skb)) { - int ret; - - rcu_read_lock(); - ret = nf_hook_egress(skb); - rcu_read_unlock(); - return ret; - } - return 0; -} - #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) @@ -4011,16 +3997,13 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; -#endif -#ifdef CONFIG_NET_EGRESS +# ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { - if (nf_egress(skb) < 0) - goto out; - skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; } +# endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. @@ -9867,7 +9850,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; - nf_hook_netdev_init(dev); + nf_hook_ingress_init(dev); return dev; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f4c68f60f241..468fea1aebba 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -10,14 +10,6 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. -config NETFILTER_EGRESS - bool "Netfilter egress support" - default y - select NET_EGRESS - help - This allows you to classify packets before transmission using the - Netfilter infrastructure. 
- config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 85e9c959aba7..78f046ec506f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -306,12 +306,6 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } -#endif -#ifdef CONFIG_NETFILTER_EGRESS - if (hooknum == NF_NETDEV_EGRESS) { - if (dev && dev_net(dev) == net) - return &dev->nf_hooks_egress; - } #endif WARN_ON_ONCE(1); return NULL; @@ -324,13 +318,11 @@ static int __nf_register_net_hook(struct net *net, int pf, struct nf_hook_entries __rcu **pp; if (pf == NFPROTO_NETDEV) { - if ((!IS_ENABLED(CONFIG_NETFILTER_INGRESS) && - reg->hooknum == NF_NETDEV_INGRESS) || - (!IS_ENABLED(CONFIG_NETFILTER_EGRESS) && - reg->hooknum == NF_NETDEV_EGRESS)) +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; - if ((reg->hooknum != NF_NETDEV_INGRESS && - reg->hooknum != NF_NETDEV_EGRESS) || +#endif + if (reg->hooknum != NF_NETDEV_INGRESS || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; } @@ -356,10 +348,6 @@ static int __nf_register_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); #endif -#ifdef CONFIG_NETFILTER_EGRESS - if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS) - net_inc_egress_queue(); -#endif #ifdef CONFIG_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); #endif @@ -418,10 +406,6 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); #endif -#ifdef CONFIG_NETFILTER_EGRESS - if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS) - net_dec_egress_queue(); -#endif #ifdef CONFIG_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); #endif diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 67ce6dbb5496..c78d01bc02e9 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -277,11 +277,9 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS) | - (1 << NF_NETDEV_EGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, - [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; -- cgit v1.2.3 From 8511d72f14bc7b94a96bb2990615e1cd7c1dd21e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Mar 2020 15:47:41 +0100 Subject: sysfs: fix static inline declaration of sysfs_groups_change_owner() The CONFIG_SYSFS declaration of sysfs_group_change_owner() is different from the !CONFIG_SYSFS version and thus causes build failures when !CONFIG_SYSFS is set. Reported-by: Stephen Rothwell Fixes: 303a42769c4c ("sysfs: add sysfs_group{s}_change_owner()") Signed-off-by: Christian Brauner Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: David S.
Miller --- include/linux/sysfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 9e531ec76274..4beb51009b62 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -562,8 +562,8 @@ static inline int sysfs_groups_change_owner(struct kobject *kobj, } static inline int sysfs_group_change_owner(struct kobject *kobj, - const struct attribute_group **groups, - kuid_t kuid, kgid_t kgid) + const struct attribute_group *groups, + kuid_t kuid, kgid_t kgid) { return 0; } -- cgit v1.2.3 From 5eee3bb7103f4a66e4b90c2817f5e72509a2a607 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 20 Mar 2020 17:51:38 +0100 Subject: net: phy: add and use phy_check_downshift So far PHY drivers have to check whether a downshift occurred to be able to notify the user. To make life a little bit easier for driver authors, move the downshift notification to phylib. phy_check_downshift() compares the highest mutually advertised speed with the actual value of phydev->speed (typically read by the PHY driver from a vendor-specific register) to detect a downshift. v2: - Add downshift hint to phy_print_status Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 38 ++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy.c | 4 +++- include/linux/phy.h | 3 +++ 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 94cd85b1e49b..66b8c61ca74c 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -329,6 +329,44 @@ void phy_resolve_aneg_linkmode(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(phy_resolve_aneg_linkmode); +/** + * phy_check_downshift - check whether downshift occurred + * @phydev: The phy_device struct + * + * Check whether a downshift to a lower speed occurred. If this should be the + * case warn the user. + * Prerequisite for detecting downshift is that PHY driver implements the + * read_status callback and sets phydev->speed to the actual link speed. + */ +void phy_check_downshift(struct phy_device *phydev) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(common); + int i, speed = SPEED_UNKNOWN; + + phydev->downshifted_rate = 0; + + if (phydev->autoneg == AUTONEG_DISABLE || + phydev->speed == SPEED_UNKNOWN) + return; + + linkmode_and(common, phydev->lp_advertising, phydev->advertising); + + for (i = 0; i < ARRAY_SIZE(settings); i++) + if (test_bit(settings[i].bit, common)) { + speed = settings[i].speed; + break; + } + + if (speed == SPEED_UNKNOWN || phydev->speed >= speed) + return; + + phydev_warn(phydev, "Downshift occurred from negotiated speed %s to actual speed %s, check cabling!\n", + phy_speed_to_str(speed), phy_speed_to_str(phydev->speed)); + + phydev->downshifted_rate = 1; +} +EXPORT_SYMBOL_GPL(phy_check_downshift); + static int phy_resolve_min_speed(struct phy_device *phydev, bool fdx_only) { __ETHTOOL_DECLARE_LINK_MODE_MASK(common); diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index d71212a418f3..72c69a9c8a98 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -96,9 +96,10 @@ void phy_print_status(struct phy_device *phydev) { if (phydev->link) { netdev_info(phydev->attached_dev, - "Link is Up - %s/%s - flow control %s\n", + "Link is Up - %s/%s %s- flow control %s\n", phy_speed_to_str(phydev->speed), phy_duplex_to_str(phydev->duplex), + phydev->downshifted_rate ?
"(downshifted) " : "", phy_pause_str(phydev)); } else { netdev_info(phydev->attached_dev, "Link is Down\n"); @@ -507,6 +508,7 @@ static int phy_check_link_status(struct phy_device *phydev) return err; if (phydev->link && phydev->state != PHY_RUNNING) { + phy_check_downshift(phydev); phydev->state = PHY_RUNNING; phy_link_up(phydev); } else if (!phydev->link && phydev->state != PHY_NOLINK) { diff --git a/include/linux/phy.h b/include/linux/phy.h index 36d9dea04016..99b5e3c4b621 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -365,6 +365,7 @@ struct macsec_ops; * suspended_by_mdio_bus: Set to true if this phy was suspended by MDIO bus. * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * loopback_enabled: Set true if this phy has been loopbacked successfully. + * downshifted_rate: Set true if link speed has been downshifted. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * irq: IRQ number of the PHY's interrupt (-1 if none) @@ -405,6 +406,7 @@ struct phy_device { unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; + unsigned downshifted_rate:1; unsigned autoneg:1; /* The most recently read link state */ @@ -698,6 +700,7 @@ static inline bool phy_is_started(struct phy_device *phydev) void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); +void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register -- cgit v1.2.3 From 5f5323a14cad19323060a8cbf9d96f2280a462dd Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 23 Mar 2020 23:05:51 +0800 Subject: iopoll: introduce read_poll_timeout macro this macro is an extension of readx_poll_timeout macro. the accessor function op just supports only one parameter in the readx_poll_timeout macro, but this macro can supports multiple variable parameters for it. so functions like phy_read(struct phy_device *phydev, u32 regnum) and phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) can also use this poll timeout core. and also expand it can sleep some time before read operation. Signed-off-by: Dejin Zheng Signed-off-by: David S. Miller --- include/linux/iopoll.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 35e15dfd4155..70f89b389648 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -13,6 +13,50 @@ #include #include +/** + * read_poll_timeout - Periodically poll an address until a condition is + * met or a timeout occurs + * @op: accessor function (takes @args as its arguments) + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.rst). + * @timeout_us: Timeout in us, 0 means never timeout + * @sleep_before_read: if it is true, sleep @sleep_us before read. + * @args: arguments for @op poll + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. 
+ */ +#define read_poll_timeout(op, val, cond, sleep_us, timeout_us, \ + sleep_before_read, args...) \ +({ \ + u64 __timeout_us = (timeout_us); \ + unsigned long __sleep_us = (sleep_us); \ + ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ + might_sleep_if((__sleep_us) != 0); \ + if (sleep_before_read && __sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ + for (;;) { \ + (val) = op(args); \ + if (cond) \ + break; \ + if (__timeout_us && \ + ktime_compare(ktime_get(), __timeout) > 0) { \ + (val) = op(args); \ + break; \ + } \ + if (__sleep_us) \ + usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs * @op: accessor function (takes @addr as its only argument) -- cgit v1.2.3 From eaa6b01024a74ab5f3064f17dd88596284f497c4 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 23 Mar 2020 23:05:52 +0800 Subject: iopoll: redefined readx_poll_timeout macro to simplify the code Redefine the readx_poll_timeout macro in terms of read_poll_timeout to simplify the code. Signed-off-by: Dejin Zheng Signed-off-by: David S. Miller --- include/linux/iopoll.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 70f89b389648..cb20c733b15a 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -76,25 +76,7 @@ * macros defined below rather than this macro directly. */ #define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ -({ \ - u64 __timeout_us = (timeout_us); \ - unsigned long __sleep_us = (sleep_us); \ - ktime_t __timeout = ktime_add_us(ktime_get(), __timeout_us); \ - might_sleep_if((__sleep_us) != 0); \ - for (;;) { \ - (val) = op(addr); \ - if (cond) \ - break; \ - if (__timeout_us && \ - ktime_compare(ktime_get(), __timeout) > 0) { \ - (val) = op(addr); \ - break; \ - } \ - if (__sleep_us) \ - usleep_range((__sleep_us >> 2) + 1, __sleep_us); \ - } \ - (cond) ? 0 : -ETIMEDOUT; \ -}) + read_poll_timeout(op, val, cond, sleep_us, timeout_us, false, addr) /** * readx_poll_timeout_atomic - Periodically poll an address until a condition is met or a timeout occurs -- cgit v1.2.3 From bd971ff0b73927b91f4520621d49d3a801ee4837 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 23 Mar 2020 23:05:53 +0800 Subject: net: phy: introduce phy_read_mmd_poll_timeout macro It is sometimes necessary to poll a PHY register with the phy_read_mmd() function until its value satisfies some condition. Introduce a phy_read_mmd_poll_timeout() macro that does this. Suggested-by: Andrew Lunn Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Dejin Zheng Signed-off-by: David S. Miller
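A minimal usage sketch (illustrative, not from this series), assuming a driver that waits for a self-clearing reset bit in an MMD register with the macro added below; the register constants come from <linux/mdio.h>:

	int val, ret;

	/* Poll the PCS CTRL1 register until the reset bit clears: sleep
	 * ~5ms between reads (and once before the first read thanks to
	 * sleep_before_read=true), giving up after 100ms. On timeout or
	 * a negative read result, ret is non-zero.
	 */
	ret = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_PCS, MDIO_CTRL1,
					val, !(val & MDIO_CTRL1_RESET),
					5000, 100000, true);
	if (ret)
		return ret;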
--- include/linux/phy.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 99b5e3c4b621..3984f375126e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -787,6 +788,19 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); +#define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ + sleep_us, timeout_us, sleep_before_read) \ +({ \ + int __ret = read_poll_timeout(phy_read_mmd, val, (cond) || val < 0, \ + sleep_us, timeout_us, sleep_before_read, \ + phydev, devaddr, regnum); \ + if (val < 0) \ + __ret = val; \ + if (__ret) \ + phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ + __ret; \ +}) + /** * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. -- cgit v1.2.3 From fcbd30d09ba05389cb40cc1769b565df62aead35 Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Mon, 23 Mar 2020 23:05:57 +0800 Subject: net: phy: introduce phy_read_poll_timeout macro It is sometimes necessary to poll a PHY register with the phy_read() function until its value satisfies some condition. Introduce a phy_read_poll_timeout() macro that does this. Suggested-by: Andrew Lunn Reviewed-by: Andrew Lunn Signed-off-by: Dejin Zheng Signed-off-by: David S. Miller --- include/linux/phy.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3984f375126e..2432ca463ddc 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -717,6 +717,19 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum) return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } +#define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ + timeout_us, sleep_before_read) \ +({ \ + int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \ + sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ + if (val < 0) \ + __ret = val; \ + if (__ret) \ + phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ + __ret; \ +}) + + /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct -- cgit v1.2.3 From 5908220b2b3d6918f88cd645a39e1dcb84d1c5d9 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 25 Mar 2020 15:52:30 +0300 Subject: net: introduce the MACSEC netdev feature This patch introduces a new netdev feature, which will be used by drivers to state they can perform MACsec transformations in hardware. The patchset was gathered by Mark; the macsec functionality itself was implemented by Dmitry, Mark and Pavel Belous. Signed-off-by: Antoine Tenart Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David S.
Miller --- include/linux/netdev_features.h | 3 +++ net/ethtool/common.c | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 34d050bb1ae6..9d53c5ad272c 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -83,6 +83,8 @@ enum { NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */ + NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */ + /* * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/core/ethtool.c and maybe @@ -154,6 +156,7 @@ enum { #define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST) #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) +#define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC) /* Finds the next feature with the highest number of the range of start till 0. */ diff --git a/net/ethtool/common.c b/net/ethtool/common.c index dab047eec943..51a0941fc62f 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -60,6 +60,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", + [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload", }; const char -- cgit v1.2.3 From 30e9bb8472f4454d0544020574bb03d96ffa0e52 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 25 Mar 2020 15:52:31 +0300 Subject: net: add a reference to MACsec ops in net_device This patch adds a reference to MACsec ops to the net_device structure, allowing net device drivers to implement offloading operations for MACsec. Signed-off-by: Antoine Tenart Signed-off-by: Mark Starovoytov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 654808bfad83..b521500b244d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -53,6 +53,8 @@ struct netpoll_info; struct device; struct phy_device; struct dsa_port; +struct macsec_context; +struct macsec_ops; struct sfp_bus; /* 802.11 specific */ @@ -1819,6 +1821,8 @@ enum netdev_priv_flags { * that follow this device when it is moved * to another network namespace. * + * @macsec_ops: MACsec offloading ops + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2113,6 +2117,11 @@ struct net_device { unsigned wol_enabled:1; struct list_head net_notifier_list; + +#if IS_ENABLED(CONFIG_MACSEC) + /* MACsec management functions */ + const struct macsec_ops *macsec_ops; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 16efafa31bc1fd7c8646dccbf30eeef3ad495d5a Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Fri, 27 Mar 2020 15:05:32 +0530 Subject: PCI: Add new PCI_VPD_RO_KEYWORD_SERIALNO macro This patch adds a new macro for serial number keyword. Acked-by: Bjorn Helgaas Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index fc54b8922e66..a048fba311d2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2184,6 +2184,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask); #define PCI_VPD_INFO_FLD_HDR_SIZE 3 #define PCI_VPD_RO_KEYWORD_PARTNO "PN" +#define PCI_VPD_RO_KEYWORD_SERIALNO "SN" #define PCI_VPD_RO_KEYWORD_MFR_ID "MN" #define PCI_VPD_RO_KEYWORD_VENDOR0 "V0" #define PCI_VPD_RO_KEYWORD_CHKSUM "RV" -- cgit v1.2.3 From ab41ca3455a208392ce95f4086d5708dc37bff86 Mon Sep 17 00:00:00 2001 From: Murali Krishna Policharla Date: Fri, 27 Mar 2020 21:55:40 +0200 Subject: net: phy: bcm7xx: add jumbo frame configuration to PHY The BCM7XX PHY family requires special configuration to pass jumbo frames. Do that during initial PHY setup. Signed-off-by: Murali Krishna Policharla Reviewed-by: Scott Branden Signed-off-by: Vladimir Oltean Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm-phy-lib.c | 22 ++++++++++++++++++++++ drivers/net/phy/bcm-phy-lib.h | 1 + drivers/net/phy/bcm7xxx.c | 4 ++++ include/linux/brcmphy.h | 2 ++ 4 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index e0d3310957ff..e77b274a09fd 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -423,6 +423,28 @@ int bcm_phy_28nm_a0b0_afe_config_init(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(bcm_phy_28nm_a0b0_afe_config_init); +int bcm_phy_enable_jumbo(struct phy_device *phydev) +{ + int ret; + + ret = bcm54xx_auxctl_read(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL); + if (ret < 0) + return ret; + + /* Enable extended length packet reception */ + ret = bcm54xx_auxctl_write(phydev, MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL, + ret | MII_BCM54XX_AUXCTL_ACTL_EXT_PKT_LEN); + if (ret < 0) + return ret; + + /* Enable the elastic FIFO for raising the transmission limit from + * 4.5KB to 10KB, at the expense of an additional 16 ns in propagation + * latency. 
+ */ + return phy_set_bits(phydev, MII_BCM54XX_ECR, MII_BCM54XX_ECR_FIFOE); +} +EXPORT_SYMBOL_GPL(bcm_phy_enable_jumbo); + MODULE_DESCRIPTION("Broadcom PHY Library"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Broadcom Corporation"); diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index c86fb9d1240c..129df819be8c 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -65,5 +65,6 @@ void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow, struct ethtool_stats *stats, u64 *data); void bcm_phy_r_rc_cal_reset(struct phy_device *phydev); int bcm_phy_28nm_a0b0_afe_config_init(struct phy_device *phydev); +int bcm_phy_enable_jumbo(struct phy_device *phydev); #endif /* _LINUX_BCM_PHY_LIB_H */ diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index af8eabe7a6d4..692048d86ab1 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -178,6 +178,10 @@ static int bcm7xxx_28nm_config_init(struct phy_device *phydev) break; } + if (ret) + return ret; + + ret = bcm_phy_enable_jumbo(phydev); if (ret) return ret; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index b475e7f20d28..6462c5447872 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -79,6 +79,7 @@ #define MII_BCM54XX_ECR 0x10 /* BCM54xx extended control register */ #define MII_BCM54XX_ECR_IM 0x1000 /* Interrupt mask */ #define MII_BCM54XX_ECR_IF 0x0800 /* Interrupt force */ +#define MII_BCM54XX_ECR_FIFOE 0x0001 /* FIFO elasticity */ #define MII_BCM54XX_ESR 0x11 /* BCM54xx extended status register */ #define MII_BCM54XX_ESR_IS 0x1000 /* Interrupt status */ @@ -119,6 +120,7 @@ #define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL 0x00 #define MII_BCM54XX_AUXCTL_ACTL_TX_6DB 0x0400 #define MII_BCM54XX_AUXCTL_ACTL_SMDSP_ENA 0x0800 +#define MII_BCM54XX_AUXCTL_ACTL_EXT_PKT_LEN 0x4000 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC 0x07 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN 0x0010 -- cgit v1.2.3 From f318903c0bf42448b4c884732df2bbb0ef7a2284 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:52 +0100 Subject: bpf: Add netns cookie and enable it for bpf cgroup hooks In Cilium we're mainly using BPF cgroup hooks today in order to implement kube-proxy free Kubernetes service translation for ClusterIP, NodePort (*), ExternalIP, and LoadBalancer as well as HostPort mapping [0] for all traffic between Cilium managed nodes. While this works in its current shape and avoids packet-level NAT for inter Cilium managed node traffic, there is one major limitation we're facing today, that is, lack of netns awareness. In Kubernetes, the concept of Pods (which hold one or multiple containers) has been built around network namespaces, so while we can use the global scope of attaching to root BPF cgroup hooks also to our advantage (e.g. for exposing NodePort ports on loopback addresses), we also have the need to differentiate between initial network namespaces and non-initial one. For example, ExternalIP services mandate that non-local service IPs are not to be translated from the host (initial) network namespace as one example. Right now, we have an ugly work-around in place where non-local service IPs for ExternalIP services are not xlated from connect() and friends BPF hooks but instead via less efficient packet-level NAT on the veth tc ingress hook for Pod traffic. On top of determining whether we're in initial or non-initial network namespace we also have a need for a socket-cookie like mechanism for network namespaces scope. 
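As a minimal sketch of the helper this patch goes on to introduce (illustrative only, not part of the submitted series), a cgroup sock_addr program could distinguish the two cases directly; the SEC() naming follows libbpf conventions:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	SEC("cgroup/connect4")
	int connect4(struct bpf_sock_addr *ctx)
	{
		/* Cookie of the initial netns vs. the socket's netns. */
		__u64 init_cookie = bpf_get_netns_cookie(NULL);
		__u64 sock_cookie = bpf_get_netns_cookie(ctx);

		if (sock_cookie != init_cookie) {
			/* Non-initial (e.g. Pod) namespace: xlation of
			 * non-local ExternalIP service IPs may apply here.
			 */
		}
		return 1;	/* allow the connect(2) */
	}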
Socket cookies have the nice property that they can be combined as part of the key structure e.g. for BPF LRU maps without having to worry that the cookie could be recycled. We are planning to use this for our sessionAffinity implementation for services. Therefore, add a new bpf_get_netns_cookie() helper which would resolve both use cases at once: bpf_get_netns_cookie(NULL) would provide the cookie for the initial network namespace while passing the context instead of NULL would provide the cookie from the application's network namespace. We're using a hole, so no size increase; the assignment happens only once. Therefore this allows for a comparison on initial namespace as well as regular cookie usage as we have today with socket cookies. We could later on enable this helper for other program types as well as we would see need. (*) Both externalTrafficPolicy={Local|Cluster} types [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/c47d2346982693a9cf9da0e12690453aded4c788.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/net/net_namespace.h | 10 ++++++++++ include/uapi/linux/bpf.h | 16 +++++++++++++++- kernel/bpf/verifier.c | 16 ++++++++++------ net/core/filter.c | 37 +++++++++++++++++++++++++++++++++++++ net/core/net_namespace.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 16 +++++++++++++++- 7 files changed, 103 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdb981c204fa..78046c570596 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -233,6 +233,7 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ + ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 854d39ef1ca3..1c6edfdb9a2c 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -168,6 +168,9 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif + + atomic64_t net_cookie; /* written once */ + #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; #endif @@ -273,6 +276,8 @@ static inline int check_net(const struct net *net) void net_drop_ns(void *); +u64 net_gen_cookie(struct net *net); + #else static inline struct net *get_net(struct net *net) @@ -300,6 +305,11 @@ static inline int check_net(const struct net *net) return 1; } +static inline u64 net_gen_cookie(struct net *net) +{ + return 0; +} + #define net_drop_ns NULL #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. 
The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ea2a868324e..46ba86c540e2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3461,13 +3461,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = CONST_PTR_TO_MAP; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX) { + } else if (arg_type == ARG_PTR_TO_CTX || + arg_type == ARG_PTR_TO_CTX_OR_NULL) { expected_type = PTR_TO_CTX; - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; + if (!(register_is_null(reg) && + arg_type == ARG_PTR_TO_CTX_OR_NULL)) { + if (type != expected_type) + goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { expected_type = PTR_TO_SOCK_COMMON; /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ diff --git a/net/core/filter.c b/net/core/filter.c index 6cb7e0e24473..e249a499cbe5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4141,6 +4141,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 __bpf_get_netns_cookie(struct sock *sk) +{ +#ifdef CONFIG_NET_NS + return net_gen_cookie(sk ? sk->sk_net.net : &init_net); +#else + return 0; +#endif +} + +BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) +{ + return __bpf_get_netns_cookie(ctx); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { + .func = bpf_get_netns_cookie_sock, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { + .func = bpf_get_netns_cookie_sock_addr, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -5968,6 +6001,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; default: @@ -5994,6 +6029,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 757cc1d084e7..190ca66a383b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; +static atomic64_t cookie_gen; + +u64 net_gen_cookie(struct net *net) +{ + while (1) { + u64 res = atomic64_read(&net->net_cookie); + + if (res) + return res; + res = atomic64_inc_return(&cookie_gen); + atomic64_cmpxchg(&net->net_cookie, 0, res); + } +} + static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -1087,6 +1101,7 @@ static int __init net_ns_init(void) panic("Could not allocate generic netns"); rcu_assign_pointer(init_net.gen, ng); + net_gen_cookie(&init_net); down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5d01c5c7e598..bd81c4555206 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2950,6 +2950,19 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of bpf_get_socket_cookie() helper, but for network namespaces + * instead of sockets. + * Return + * A 8-byte long opaque number. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3073,7 +3086,8 @@ union bpf_attr { FN(jiffies64), \ FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ - FN(xdp_output), + FN(xdp_output), \ + FN(get_netns_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 0f09abd105da6c37713d2b253730a86cb45e127a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 27 Mar 2020 16:58:54 +0100 Subject: bpf: Enable bpf cgroup hooks to retrieve cgroup v2 and ancestor id Enable the bpf_get_current_cgroup_id() helper for connect(), sendmsg(), recvmsg() and bind-related hooks in order to retrieve the cgroup v2 context which can then be used as part of the key for BPF map lookups, for example. Given these hooks operate in process context 'current' is always valid and pointing to the app that is performing mentioned syscalls if it's subject to a v2 cgroup. Also with same motivation of commit 7723628101aa ("bpf: Introduce bpf_skb_ancestor_cgroup_id helper") enable retrieval of ancestor from current so the cgroup id can be used for policy lookups which can then forbid connect() / bind(), for example. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d2a7ef42530ad299e3cbb245e6c12374b72145ef.1585323121.git.daniel@iogearbox.net --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 18 ++++++++++++++++++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++++- 6 files changed, 72 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 78046c570596..372708eeaecd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1501,6 +1501,7 @@ extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 914f3463aa41..916f5132a984 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2156,6 +2156,7 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01878db15eaf..bafc53ddd350 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -340,6 +340,24 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *ancestor; + + ancestor = cgroup_ancestor(cgrp, ancestor_level); + if (!ancestor) + return 0; + return cgroup_id(ancestor); +} + +const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { + .func = bpf_get_current_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + #ifdef CONFIG_CGROUP_BPF DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); diff --git a/net/core/filter.c b/net/core/filter.c index 3083c7746ee0..5cec3ac9e3dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6018,6 +6018,12 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; @@ -6052,6 +6058,12 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bd81c4555206..222ba11966e3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2963,6 +2963,24 @@ union bpf_attr { * instead of sockets. * Return * A 8-byte long opaque number. 
+ * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3087,7 +3105,8 @@ union bpf_attr { FN(read_branch_records), \ FN(get_ns_current_pid_tgid), \ FN(xdp_output), \ - FN(get_netns_cookie), + FN(get_netns_cookie), \ + FN(get_current_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 92234c8f15c8d96ad7e52afdc5994cba6be68eb9 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Wed, 25 Mar 2020 18:23:26 +0100 Subject: xdp: Support specifying expected existing program when attaching XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While it is currently possible for userspace to specify that an existing XDP program should not be replaced when attaching to an interface, there is no mechanism to safely replace a specific XDP program with another. This patch adds a new netlink attribute, IFLA_XDP_EXPECTED_FD, which can be set along with IFLA_XDP_FD. If set, the kernel will check that the program currently loaded on the interface matches the expected one, and fail the operation if it does not. This corresponds to a 'cmpxchg' memory operation. Setting the new attribute with a negative value means that no program is expected to be attached, which corresponds to setting the UPDATE_IF_NOEXIST flag. A new companion flag, XDP_FLAGS_REPLACE, is also added to explicitly request checking of the EXPECTED_FD attribute. This is needed for userspace to discover whether the kernel supports the new attribute. 
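A minimal userspace sketch of the atomic-replace flow (illustrative, not part of this patch), via the companion libbpf support from the same series; bpf_set_link_xdp_fd_opts() and struct bpf_xdp_set_link_opts with an .old_fd member are assumed to be available:

	#include <bpf/libbpf.h>
	#include <linux/if_link.h>

	static int xdp_replace(int ifindex, int old_fd, int new_fd)
	{
		DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
				    .old_fd = old_fd);

		/* The kernel compares the currently attached program
		 * against old_fd and returns -EEXIST on a mismatch.
		 */
		return bpf_set_link_xdp_fd_opts(ifindex, new_fd,
						XDP_FLAGS_REPLACE, &opts);
	}

Passing .old_fd = -1 together with XDP_FLAGS_REPLACE asserts that no program is currently attached, matching the negative-value semantics described above.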
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/158515700640.92963.3551295145441017022.stgit@toke.dk --- include/linux/netdevice.h | 2 +- include/uapi/linux/if_link.h | 4 +++- net/core/dev.c | 26 +++++++++++++++++++++----- net/core/rtnetlink.c | 14 ++++++++++++++ 4 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 654808bfad83..b503d468f0df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3768,7 +3768,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags); + int fd, int expected_fd, u32 flags); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, enum bpf_netdev_command cmd); int xdp_umem_query(struct net_device *dev, u16 queue_id); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 61e0801c82df..c2f768c8d65b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -972,11 +972,12 @@ enum { #define XDP_FLAGS_SKB_MODE (1U << 1) #define XDP_FLAGS_DRV_MODE (1U << 2) #define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) #define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE | \ XDP_FLAGS_HW_MODE) #define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ - XDP_FLAGS_MODES) + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) /* These are stored into IFLA_XDP_ATTACHED on dump. */ enum { @@ -996,6 +997,7 @@ enum { IFLA_XDP_DRV_PROG_ID, IFLA_XDP_SKB_PROG_ID, IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, __IFLA_XDP_MAX, }; diff --git a/net/core/dev.c b/net/core/dev.c index d84541c24446..651a3c28d33a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8655,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev) * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear + * @expected_fd: old program fd that userspace expects to replace or clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags) + int fd, int expected_fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; enum bpf_netdev_command query; + u32 prog_id, expected_id = 0; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; bool offload; @@ -8684,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, if (bpf_op == bpf_chk) bpf_chk = generic_xdp_install; - if (fd >= 0) { - u32 prog_id; + prog_id = __dev_xdp_query(dev, bpf_op, query); + if (flags & XDP_FLAGS_REPLACE) { + if (expected_fd >= 0) { + prog = bpf_prog_get_type_dev(expected_fd, + BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); + if (IS_ERR(prog)) + return PTR_ERR(prog); + expected_id = prog->aux->id; + bpf_prog_put(prog); + } + if (prog_id != expected_id) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + } + if (fd >= 0) { if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); return -EEXIST; } - prog_id = __dev_xdp_query(dev, bpf_op, query); if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { NL_SET_ERR_MSG(extack, "XDP program already 
attached"); return -EBUSY; @@ -8715,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return 0; } } else { - if (!__dev_xdp_query(dev, bpf_op, query)) + if (!prog_id) return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 14e6ea21c378..709ebbf8ab5b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { + [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, + [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, @@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb, } if (xdp[IFLA_XDP_FD]) { + int expected_fd = -1; + + if (xdp_flags & XDP_FLAGS_REPLACE) { + if (!xdp[IFLA_XDP_EXPECTED_FD]) { + err = -EINVAL; + goto errout; + } + expected_fd = + nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); + } + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), + expected_fd, xdp_flags); if (err) goto errout; -- cgit v1.2.3 From fc611f47f2188ade2b48ff6902d5cce8baac0c58 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:49 +0100 Subject: bpf: Introduce BPF_PROG_TYPE_LSM Introduce types and configs for bpf programs that can be attached to LSM hooks. The programs can be enabled by the config option CONFIG_BPF_LSM. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Reviewed-by: Thomas Garnier Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-2-kpsingh@chromium.org --- MAINTAINERS | 1 + include/linux/bpf.h | 3 +++ include/linux/bpf_types.h | 4 ++++ include/uapi/linux/bpf.h | 2 ++ init/Kconfig | 12 ++++++++++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/libbpf_probes.c | 1 + 10 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/bpf_lsm.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 5dbee41045bc..3197fe9256b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3147,6 +3147,7 @@ R: Martin KaFai Lau R: Song Liu R: Yonghong Song R: Andrii Nakryiko +R: KP Singh L: netdev@vger.kernel.org L: bpf@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 372708eeaecd..3bde59a8453b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1515,6 +1515,9 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +const struct bpf_func_proto *bpf_tracing_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c81d4ece79a4..ba0c2d56f8a3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -70,6 +70,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, void *, void *) BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, void *, void *) +#ifdef CONFIG_BPF_LSM +BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, + void *, void *) +#endif /* CONFIG_BPF_LSM */ #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..deae572d1927 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1616,6 +1616,18 @@ config KALLSYMS_BASE_RELATIVE # end of the "standard kernel features (expert users)" menu # syscall, maps, verifier + +config BPF_LSM + bool "LSM Instrumentation with BPF" + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with eBPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 046ce5d98033..f2d7be596966 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,4 +29,5 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c new file mode 100644 index 000000000000..82875039ca90 --- /dev/null +++ b/kernel/bpf/bpf_lsm.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. 
+ */ + +#include +#include +#include + +const struct bpf_prog_ops lsm_prog_ops = { +}; + +const struct bpf_verifier_ops lsm_verifier_ops = { + .get_func_proto = bpf_tracing_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e619eedb5919..37ffceab608f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -779,8 +779,8 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -865,7 +865,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -975,7 +975,7 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1082,7 +1082,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1210,7 +1210,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index b782ebef6ac9..2c92059c0c90 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -108,6 +108,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: default: break; } -- cgit v1.2.3 From 98e828a0650f348be85728c69875260cf78069e6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:50 +0100 Subject: security: Refactor declaration of LSM hooks The information about the different types of LSM hooks is scattered in two locations i.e. union security_list_options and struct security_hook_heads. Rather than duplicating this information even further for BPF_PROG_TYPE_LSM, define all the hooks with the LSM_HOOK macro in lsm_hook_defs.h which is then used to generate all the data structures required by the LSM framework. The LSM hooks are defined as: LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...)
with <default_value> accessible in security.c as: LSM_RET_DEFAULT(<hook_name>) Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-3-kpsingh@chromium.org --- include/linux/lsm_hook_defs.h | 381 +++++++++++++++++++++++++ include/linux/lsm_hooks.h | 628 +----------------------------------------- security/security.c | 41 ++- 3 files changed, 423 insertions(+), 627 deletions(-) create mode 100644 include/linux/lsm_hook_defs.h (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h new file mode 100644 index 000000000000..9cd4455528e5 --- /dev/null +++ b/include/linux/lsm_hook_defs.h @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Linux Security Module Hook declarations. + * + * Copyright (C) 2001 WireX Communications, Inc + * Copyright (C) 2001 Greg Kroah-Hartman + * Copyright (C) 2001 Networks Associates Technology, Inc + * Copyright (C) 2001 James Morris + * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group) + * Copyright (C) 2015 Intel Corporation. + * Copyright (C) 2015 Casey Schaufler + * Copyright (C) 2016 Mellanox Techonologies + * Copyright (C) 2020 Google LLC. + */ + +/* + * The macro LSM_HOOK is used to define the data structures required by the + * the LSM framework using the pattern: + * + * LSM_HOOK(<return_type>, <default_value>, <hook_name>, args...) + * + * struct security_hook_heads { + * #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; + * #include <linux/lsm_hook_defs.h> + * #undef LSM_HOOK + * }; + */ +LSM_HOOK(int, 0, binder_set_context_mgr, struct task_struct *mgr) +LSM_HOOK(int, 0, binder_transaction, struct task_struct *from, + struct task_struct *to) +LSM_HOOK(int, 0, binder_transfer_binder, struct task_struct *from, + struct task_struct *to) +LSM_HOOK(int, 0, binder_transfer_file, struct task_struct *from, + struct task_struct *to, struct file *file) +LSM_HOOK(int, 0, ptrace_access_check, struct task_struct *child, + unsigned int mode) +LSM_HOOK(int, 0, ptrace_traceme, struct task_struct *parent) +LSM_HOOK(int, 0, capget, struct task_struct *target, kernel_cap_t *effective, + kernel_cap_t *inheritable, kernel_cap_t *permitted) +LSM_HOOK(int, 0, capset, struct cred *new, const struct cred *old, + const kernel_cap_t *effective, const kernel_cap_t *inheritable, + const kernel_cap_t *permitted) +LSM_HOOK(int, 0, capable, const struct cred *cred, struct user_namespace *ns, + int cap, unsigned int opts) +LSM_HOOK(int, 0, quotactl, int cmds, int type, int id, struct super_block *sb) +LSM_HOOK(int, 0, quota_on, struct dentry *dentry) +LSM_HOOK(int, 0, syslog, int type) +LSM_HOOK(int, 0, settime, const struct timespec64 *ts, + const struct timezone *tz) +LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) +LSM_HOOK(int, 0, bprm_set_creds, struct linux_binprm *bprm) +LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) +LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm) +LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm) +LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, + struct fs_context *src_sc) +LSM_HOOK(int, 0, fs_context_parse_param, struct fs_context *fc, + struct fs_parameter *param) +LSM_HOOK(int, 0, sb_alloc_security, struct super_block *sb) +LSM_HOOK(void, LSM_RET_VOID, sb_free_security, struct super_block *sb) +LSM_HOOK(void, LSM_RET_VOID, sb_free_mnt_opts, void *mnt_opts)
+LSM_HOOK(int, 0, sb_eat_lsm_opts, char *orig, void **mnt_opts) +LSM_HOOK(int, 0, sb_remount, struct super_block *sb, void *mnt_opts) +LSM_HOOK(int, 0, sb_kern_mount, struct super_block *sb) +LSM_HOOK(int, 0, sb_show_options, struct seq_file *m, struct super_block *sb) +LSM_HOOK(int, 0, sb_statfs, struct dentry *dentry) +LSM_HOOK(int, 0, sb_mount, const char *dev_name, const struct path *path, + const char *type, unsigned long flags, void *data) +LSM_HOOK(int, 0, sb_umount, struct vfsmount *mnt, int flags) +LSM_HOOK(int, 0, sb_pivotroot, const struct path *old_path, + const struct path *new_path) +LSM_HOOK(int, 0, sb_set_mnt_opts, struct super_block *sb, void *mnt_opts, + unsigned long kern_flags, unsigned long *set_kern_flags) +LSM_HOOK(int, 0, sb_clone_mnt_opts, const struct super_block *oldsb, + struct super_block *newsb, unsigned long kern_flags, + unsigned long *set_kern_flags) +LSM_HOOK(int, 0, sb_add_mnt_opt, const char *option, const char *val, + int len, void **mnt_opts) +LSM_HOOK(int, 0, move_mount, const struct path *from_path, + const struct path *to_path) +LSM_HOOK(int, 0, dentry_init_security, struct dentry *dentry, + int mode, const struct qstr *name, void **ctx, u32 *ctxlen) +LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, + struct qstr *name, const struct cred *old, struct cred *new) + +#ifdef CONFIG_SECURITY_PATH +LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) +LSM_HOOK(int, 0, path_mkdir, const struct path *dir, struct dentry *dentry, + umode_t mode) +LSM_HOOK(int, 0, path_rmdir, const struct path *dir, struct dentry *dentry) +LSM_HOOK(int, 0, path_mknod, const struct path *dir, struct dentry *dentry, + umode_t mode, unsigned int dev) +LSM_HOOK(int, 0, path_truncate, const struct path *path) +LSM_HOOK(int, 0, path_symlink, const struct path *dir, struct dentry *dentry, + const char *old_name) +LSM_HOOK(int, 0, path_link, struct dentry *old_dentry, + const struct path *new_dir, struct dentry *new_dentry) +LSM_HOOK(int, 0, path_rename, const struct path *old_dir, + struct dentry *old_dentry, const struct path *new_dir, + struct dentry *new_dentry) +LSM_HOOK(int, 0, path_chmod, const struct path *path, umode_t mode) +LSM_HOOK(int, 0, path_chown, const struct path *path, kuid_t uid, kgid_t gid) +LSM_HOOK(int, 0, path_chroot, const struct path *path) +#endif /* CONFIG_SECURITY_PATH */ + +/* Needed for inode based security check */ +LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask, + unsigned int obj_type) +LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode) +LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode) +LSM_HOOK(int, 0, inode_init_security, struct inode *inode, + struct inode *dir, const struct qstr *qstr, const char **name, + void **value, size_t *len) +LSM_HOOK(int, 0, inode_create, struct inode *dir, struct dentry *dentry, + umode_t mode) +LSM_HOOK(int, 0, inode_link, struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +LSM_HOOK(int, 0, inode_unlink, struct inode *dir, struct dentry *dentry) +LSM_HOOK(int, 0, inode_symlink, struct inode *dir, struct dentry *dentry, + const char *old_name) +LSM_HOOK(int, 0, inode_mkdir, struct inode *dir, struct dentry *dentry, + umode_t mode) +LSM_HOOK(int, 0, inode_rmdir, struct inode *dir, struct dentry *dentry) +LSM_HOOK(int, 0, inode_mknod, struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev) +LSM_HOOK(int, 0, inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode 
*new_dir, struct dentry *new_dentry) +LSM_HOOK(int, 0, inode_readlink, struct dentry *dentry) +LSM_HOOK(int, 0, inode_follow_link, struct dentry *dentry, struct inode *inode, + bool rcu) +LSM_HOOK(int, 0, inode_permission, struct inode *inode, int mask) +LSM_HOOK(int, 0, inode_setattr, struct dentry *dentry, struct iattr *attr) +LSM_HOOK(int, 0, inode_getattr, const struct path *path) +LSM_HOOK(int, 0, inode_setxattr, struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +LSM_HOOK(void, LSM_RET_VOID, inode_post_setxattr, struct dentry *dentry, + const char *name, const void *value, size_t size, int flags) +LSM_HOOK(int, 0, inode_getxattr, struct dentry *dentry, const char *name) +LSM_HOOK(int, 0, inode_listxattr, struct dentry *dentry) +LSM_HOOK(int, 0, inode_removexattr, struct dentry *dentry, const char *name) +LSM_HOOK(int, 0, inode_need_killpriv, struct dentry *dentry) +LSM_HOOK(int, 0, inode_killpriv, struct dentry *dentry) +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecurity, struct inode *inode, + const char *name, void **buffer, bool alloc) +LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode, + const char *name, const void *value, size_t size, int flags) +LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, + size_t buffer_size) +LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) +LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) +LSM_HOOK(int, 0, inode_copy_up_xattr, const char *name) +LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, + struct kernfs_node *kn) +LSM_HOOK(int, 0, file_permission, struct file *file, int mask) +LSM_HOOK(int, 0, file_alloc_security, struct file *file) +LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) +LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, + unsigned long arg) +LSM_HOOK(int, 0, mmap_addr, unsigned long addr) +LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags) +LSM_HOOK(int, 0, file_mprotect, struct vm_area_struct *vma, + unsigned long reqprot, unsigned long prot) +LSM_HOOK(int, 0, file_lock, struct file *file, unsigned int cmd) +LSM_HOOK(int, 0, file_fcntl, struct file *file, unsigned int cmd, + unsigned long arg) +LSM_HOOK(void, LSM_RET_VOID, file_set_fowner, struct file *file) +LSM_HOOK(int, 0, file_send_sigiotask, struct task_struct *tsk, + struct fown_struct *fown, int sig) +LSM_HOOK(int, 0, file_receive, struct file *file) +LSM_HOOK(int, 0, file_open, struct file *file) +LSM_HOOK(int, 0, task_alloc, struct task_struct *task, + unsigned long clone_flags) +LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) +LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) +LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) +LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old, + gfp_t gfp) +LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new, + const struct cred *old) +LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid) +LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid) +LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode) +LSM_HOOK(int, 0, kernel_module_request, char *kmod_name) +LSM_HOOK(int, 0, kernel_load_data, enum kernel_load_data_id id) +LSM_HOOK(int, 0, kernel_read_file, struct file *file, + enum kernel_read_file_id id) +LSM_HOOK(int, 0, kernel_post_read_file, struct file *file, char *buf, + loff_t 
size, enum kernel_read_file_id id) +LSM_HOOK(int, 0, task_fix_setuid, struct cred *new, const struct cred *old, + int flags) +LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) +LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) +LSM_HOOK(int, 0, task_getsid, struct task_struct *p) +LSM_HOOK(void, LSM_RET_VOID, task_getsecid, struct task_struct *p, u32 *secid) +LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) +LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) +LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) +LSM_HOOK(int, 0, task_prlimit, const struct cred *cred, + const struct cred *tcred, unsigned int flags) +LSM_HOOK(int, 0, task_setrlimit, struct task_struct *p, unsigned int resource, + struct rlimit *new_rlim) +LSM_HOOK(int, 0, task_setscheduler, struct task_struct *p) +LSM_HOOK(int, 0, task_getscheduler, struct task_struct *p) +LSM_HOOK(int, 0, task_movememory, struct task_struct *p) +LSM_HOOK(int, 0, task_kill, struct task_struct *p, struct kernel_siginfo *info, + int sig, const struct cred *cred) +LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5) +LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p, + struct inode *inode) +LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag) +LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp, + u32 *secid) +LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg) +LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg) +LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm) +LSM_HOOK(void, LSM_RET_VOID, msg_queue_free_security, + struct kern_ipc_perm *perm) +LSM_HOOK(int, 0, msg_queue_associate, struct kern_ipc_perm *perm, int msqflg) +LSM_HOOK(int, 0, msg_queue_msgctl, struct kern_ipc_perm *perm, int cmd) +LSM_HOOK(int, 0, msg_queue_msgsnd, struct kern_ipc_perm *perm, + struct msg_msg *msg, int msqflg) +LSM_HOOK(int, 0, msg_queue_msgrcv, struct kern_ipc_perm *perm, + struct msg_msg *msg, struct task_struct *target, long type, int mode) +LSM_HOOK(int, 0, shm_alloc_security, struct kern_ipc_perm *perm) +LSM_HOOK(void, LSM_RET_VOID, shm_free_security, struct kern_ipc_perm *perm) +LSM_HOOK(int, 0, shm_associate, struct kern_ipc_perm *perm, int shmflg) +LSM_HOOK(int, 0, shm_shmctl, struct kern_ipc_perm *perm, int cmd) +LSM_HOOK(int, 0, shm_shmat, struct kern_ipc_perm *perm, char __user *shmaddr, + int shmflg) +LSM_HOOK(int, 0, sem_alloc_security, struct kern_ipc_perm *perm) +LSM_HOOK(void, LSM_RET_VOID, sem_free_security, struct kern_ipc_perm *perm) +LSM_HOOK(int, 0, sem_associate, struct kern_ipc_perm *perm, int semflg) +LSM_HOOK(int, 0, sem_semctl, struct kern_ipc_perm *perm, int cmd) +LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, + unsigned nsops, int alter) +LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) +LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, + struct inode *inode) +LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, + char **value) +LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) +LSM_HOOK(int, 0, ismaclabel, const char *name) +LSM_HOOK(int, 0, secid_to_secctx, u32 secid, char **secdata, + u32 *seclen) +LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) +LSM_HOOK(void, LSM_RET_VOID, 
inode_invalidate_secctx, struct inode *inode) +LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) +LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) +LSM_HOOK(int, 0, inode_getsecctx, struct inode *inode, void **ctx, + u32 *ctxlen) + +#ifdef CONFIG_SECURITY_NETWORK +LSM_HOOK(int, 0, unix_stream_connect, struct sock *sock, struct sock *other, + struct sock *newsk) +LSM_HOOK(int, 0, unix_may_send, struct socket *sock, struct socket *other) +LSM_HOOK(int, 0, socket_create, int family, int type, int protocol, int kern) +LSM_HOOK(int, 0, socket_post_create, struct socket *sock, int family, int type, + int protocol, int kern) +LSM_HOOK(int, 0, socket_socketpair, struct socket *socka, struct socket *sockb) +LSM_HOOK(int, 0, socket_bind, struct socket *sock, struct sockaddr *address, + int addrlen) +LSM_HOOK(int, 0, socket_connect, struct socket *sock, struct sockaddr *address, + int addrlen) +LSM_HOOK(int, 0, socket_listen, struct socket *sock, int backlog) +LSM_HOOK(int, 0, socket_accept, struct socket *sock, struct socket *newsock) +LSM_HOOK(int, 0, socket_sendmsg, struct socket *sock, struct msghdr *msg, + int size) +LSM_HOOK(int, 0, socket_recvmsg, struct socket *sock, struct msghdr *msg, + int size, int flags) +LSM_HOOK(int, 0, socket_getsockname, struct socket *sock) +LSM_HOOK(int, 0, socket_getpeername, struct socket *sock) +LSM_HOOK(int, 0, socket_getsockopt, struct socket *sock, int level, int optname) +LSM_HOOK(int, 0, socket_setsockopt, struct socket *sock, int level, int optname) +LSM_HOOK(int, 0, socket_shutdown, struct socket *sock, int how) +LSM_HOOK(int, 0, socket_sock_rcv_skb, struct sock *sk, struct sk_buff *skb) +LSM_HOOK(int, 0, socket_getpeersec_stream, struct socket *sock, + char __user *optval, int __user *optlen, unsigned len) +LSM_HOOK(int, 0, socket_getpeersec_dgram, struct socket *sock, + struct sk_buff *skb, u32 *secid) +LSM_HOOK(int, 0, sk_alloc_security, struct sock *sk, int family, gfp_t priority) +LSM_HOOK(void, LSM_RET_VOID, sk_free_security, struct sock *sk) +LSM_HOOK(void, LSM_RET_VOID, sk_clone_security, const struct sock *sk, + struct sock *newsk) +LSM_HOOK(void, LSM_RET_VOID, sk_getsecid, struct sock *sk, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, sock_graft, struct sock *sk, struct socket *parent) +LSM_HOOK(int, 0, inet_conn_request, struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +LSM_HOOK(void, LSM_RET_VOID, inet_csk_clone, struct sock *newsk, + const struct request_sock *req) +LSM_HOOK(void, LSM_RET_VOID, inet_conn_established, struct sock *sk, + struct sk_buff *skb) +LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) +LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) +LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) +LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, + struct flowi *fl) +LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) +LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) +LSM_HOOK(int, 0, tun_dev_create, void) +LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) +LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) +LSM_HOOK(int, 0, tun_dev_open, void *security) +LSM_HOOK(int, 0, sctp_assoc_request, struct sctp_endpoint *ep, + struct sk_buff *skb) +LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, + struct sockaddr *address, int addrlen) +LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_endpoint *ep, + struct sock *sk, struct sock *newsk) +#endif /* 
CONFIG_SECURITY_NETWORK */ + +#ifdef CONFIG_SECURITY_INFINIBAND +LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey) +LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name, + u8 port_num) +LSM_HOOK(int, 0, ib_alloc_security, void **sec) +LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec) +#endif /* CONFIG_SECURITY_INFINIBAND */ + +#ifdef CONFIG_SECURITY_NETWORK_XFRM +LSM_HOOK(int, 0, xfrm_policy_alloc_security, struct xfrm_sec_ctx **ctxp, + struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) +LSM_HOOK(int, 0, xfrm_policy_clone_security, struct xfrm_sec_ctx *old_ctx, + struct xfrm_sec_ctx **new_ctx) +LSM_HOOK(void, LSM_RET_VOID, xfrm_policy_free_security, + struct xfrm_sec_ctx *ctx) +LSM_HOOK(int, 0, xfrm_policy_delete_security, struct xfrm_sec_ctx *ctx) +LSM_HOOK(int, 0, xfrm_state_alloc, struct xfrm_state *x, + struct xfrm_user_sec_ctx *sec_ctx) +LSM_HOOK(int, 0, xfrm_state_alloc_acquire, struct xfrm_state *x, + struct xfrm_sec_ctx *polsec, u32 secid) +LSM_HOOK(void, LSM_RET_VOID, xfrm_state_free_security, struct xfrm_state *x) +LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) +LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid, + u8 dir) +LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, + struct xfrm_policy *xp, const struct flowi *fl) +LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, + int ckall) +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + +/* key management security hooks */ +#ifdef CONFIG_KEYS +LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, + unsigned long flags) +LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) +LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, + unsigned perm) +LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **_buffer) +#endif /* CONFIG_KEYS */ + +#ifdef CONFIG_AUDIT +LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, + void **lsmrule) +LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) +LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) +LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) +#endif /* CONFIG_AUDIT */ + +#ifdef CONFIG_BPF_SYSCALL +LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) +LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) +LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +#endif /* CONFIG_BPF_SYSCALL */ + +LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) + +#ifdef CONFIG_PERF_EVENTS +LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) +LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) +LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) +LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) +LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) +#endif /* CONFIG_PERF_EVENTS */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 20d8cf194fb7..c09623b32489 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1456,625 +1456,15 @@ * @what: kernel feature being accessed */ union security_list_options { - int (*binder_set_context_mgr)(struct task_struct *mgr); - int 
(*binder_transaction)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_binder)(struct task_struct *from, - struct task_struct *to); - int (*binder_transfer_file)(struct task_struct *from, - struct task_struct *to, - struct file *file); - - int (*ptrace_access_check)(struct task_struct *child, - unsigned int mode); - int (*ptrace_traceme)(struct task_struct *parent); - int (*capget)(struct task_struct *target, kernel_cap_t *effective, - kernel_cap_t *inheritable, kernel_cap_t *permitted); - int (*capset)(struct cred *new, const struct cred *old, - const kernel_cap_t *effective, - const kernel_cap_t *inheritable, - const kernel_cap_t *permitted); - int (*capable)(const struct cred *cred, - struct user_namespace *ns, - int cap, - unsigned int opts); - int (*quotactl)(int cmds, int type, int id, struct super_block *sb); - int (*quota_on)(struct dentry *dentry); - int (*syslog)(int type); - int (*settime)(const struct timespec64 *ts, const struct timezone *tz); - int (*vm_enough_memory)(struct mm_struct *mm, long pages); - - int (*bprm_set_creds)(struct linux_binprm *bprm); - int (*bprm_check_security)(struct linux_binprm *bprm); - void (*bprm_committing_creds)(struct linux_binprm *bprm); - void (*bprm_committed_creds)(struct linux_binprm *bprm); - - int (*fs_context_dup)(struct fs_context *fc, struct fs_context *src_sc); - int (*fs_context_parse_param)(struct fs_context *fc, struct fs_parameter *param); - - int (*sb_alloc_security)(struct super_block *sb); - void (*sb_free_security)(struct super_block *sb); - void (*sb_free_mnt_opts)(void *mnt_opts); - int (*sb_eat_lsm_opts)(char *orig, void **mnt_opts); - int (*sb_remount)(struct super_block *sb, void *mnt_opts); - int (*sb_kern_mount)(struct super_block *sb); - int (*sb_show_options)(struct seq_file *m, struct super_block *sb); - int (*sb_statfs)(struct dentry *dentry); - int (*sb_mount)(const char *dev_name, const struct path *path, - const char *type, unsigned long flags, void *data); - int (*sb_umount)(struct vfsmount *mnt, int flags); - int (*sb_pivotroot)(const struct path *old_path, const struct path *new_path); - int (*sb_set_mnt_opts)(struct super_block *sb, - void *mnt_opts, - unsigned long kern_flags, - unsigned long *set_kern_flags); - int (*sb_clone_mnt_opts)(const struct super_block *oldsb, - struct super_block *newsb, - unsigned long kern_flags, - unsigned long *set_kern_flags); - int (*sb_add_mnt_opt)(const char *option, const char *val, int len, - void **mnt_opts); - int (*move_mount)(const struct path *from_path, const struct path *to_path); - int (*dentry_init_security)(struct dentry *dentry, int mode, - const struct qstr *name, void **ctx, - u32 *ctxlen); - int (*dentry_create_files_as)(struct dentry *dentry, int mode, - struct qstr *name, - const struct cred *old, - struct cred *new); - - -#ifdef CONFIG_SECURITY_PATH - int (*path_unlink)(const struct path *dir, struct dentry *dentry); - int (*path_mkdir)(const struct path *dir, struct dentry *dentry, - umode_t mode); - int (*path_rmdir)(const struct path *dir, struct dentry *dentry); - int (*path_mknod)(const struct path *dir, struct dentry *dentry, - umode_t mode, unsigned int dev); - int (*path_truncate)(const struct path *path); - int (*path_symlink)(const struct path *dir, struct dentry *dentry, - const char *old_name); - int (*path_link)(struct dentry *old_dentry, const struct path *new_dir, - struct dentry *new_dentry); - int (*path_rename)(const struct path *old_dir, struct dentry *old_dentry, - const struct path *new_dir, - struct 
dentry *new_dentry); - int (*path_chmod)(const struct path *path, umode_t mode); - int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid); - int (*path_chroot)(const struct path *path); -#endif - /* Needed for inode based security check */ - int (*path_notify)(const struct path *path, u64 mask, - unsigned int obj_type); - int (*inode_alloc_security)(struct inode *inode); - void (*inode_free_security)(struct inode *inode); - int (*inode_init_security)(struct inode *inode, struct inode *dir, - const struct qstr *qstr, - const char **name, void **value, - size_t *len); - int (*inode_create)(struct inode *dir, struct dentry *dentry, - umode_t mode); - int (*inode_link)(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry); - int (*inode_unlink)(struct inode *dir, struct dentry *dentry); - int (*inode_symlink)(struct inode *dir, struct dentry *dentry, - const char *old_name); - int (*inode_mkdir)(struct inode *dir, struct dentry *dentry, - umode_t mode); - int (*inode_rmdir)(struct inode *dir, struct dentry *dentry); - int (*inode_mknod)(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev); - int (*inode_rename)(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, - struct dentry *new_dentry); - int (*inode_readlink)(struct dentry *dentry); - int (*inode_follow_link)(struct dentry *dentry, struct inode *inode, - bool rcu); - int (*inode_permission)(struct inode *inode, int mask); - int (*inode_setattr)(struct dentry *dentry, struct iattr *attr); - int (*inode_getattr)(const struct path *path); - int (*inode_setxattr)(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); - void (*inode_post_setxattr)(struct dentry *dentry, const char *name, - const void *value, size_t size, - int flags); - int (*inode_getxattr)(struct dentry *dentry, const char *name); - int (*inode_listxattr)(struct dentry *dentry); - int (*inode_removexattr)(struct dentry *dentry, const char *name); - int (*inode_need_killpriv)(struct dentry *dentry); - int (*inode_killpriv)(struct dentry *dentry); - int (*inode_getsecurity)(struct inode *inode, const char *name, - void **buffer, bool alloc); - int (*inode_setsecurity)(struct inode *inode, const char *name, - const void *value, size_t size, - int flags); - int (*inode_listsecurity)(struct inode *inode, char *buffer, - size_t buffer_size); - void (*inode_getsecid)(struct inode *inode, u32 *secid); - int (*inode_copy_up)(struct dentry *src, struct cred **new); - int (*inode_copy_up_xattr)(const char *name); - - int (*kernfs_init_security)(struct kernfs_node *kn_dir, - struct kernfs_node *kn); - - int (*file_permission)(struct file *file, int mask); - int (*file_alloc_security)(struct file *file); - void (*file_free_security)(struct file *file); - int (*file_ioctl)(struct file *file, unsigned int cmd, - unsigned long arg); - int (*mmap_addr)(unsigned long addr); - int (*mmap_file)(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags); - int (*file_mprotect)(struct vm_area_struct *vma, unsigned long reqprot, - unsigned long prot); - int (*file_lock)(struct file *file, unsigned int cmd); - int (*file_fcntl)(struct file *file, unsigned int cmd, - unsigned long arg); - void (*file_set_fowner)(struct file *file); - int (*file_send_sigiotask)(struct task_struct *tsk, - struct fown_struct *fown, int sig); - int (*file_receive)(struct file *file); - int (*file_open)(struct file *file); - - int (*task_alloc)(struct task_struct *task, unsigned long 
clone_flags); - void (*task_free)(struct task_struct *task); - int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); - void (*cred_free)(struct cred *cred); - int (*cred_prepare)(struct cred *new, const struct cred *old, - gfp_t gfp); - void (*cred_transfer)(struct cred *new, const struct cred *old); - void (*cred_getsecid)(const struct cred *c, u32 *secid); - int (*kernel_act_as)(struct cred *new, u32 secid); - int (*kernel_create_files_as)(struct cred *new, struct inode *inode); - int (*kernel_module_request)(char *kmod_name); - int (*kernel_load_data)(enum kernel_load_data_id id); - int (*kernel_read_file)(struct file *file, enum kernel_read_file_id id); - int (*kernel_post_read_file)(struct file *file, char *buf, loff_t size, - enum kernel_read_file_id id); - int (*task_fix_setuid)(struct cred *new, const struct cred *old, - int flags); - int (*task_setpgid)(struct task_struct *p, pid_t pgid); - int (*task_getpgid)(struct task_struct *p); - int (*task_getsid)(struct task_struct *p); - void (*task_getsecid)(struct task_struct *p, u32 *secid); - int (*task_setnice)(struct task_struct *p, int nice); - int (*task_setioprio)(struct task_struct *p, int ioprio); - int (*task_getioprio)(struct task_struct *p); - int (*task_prlimit)(const struct cred *cred, const struct cred *tcred, - unsigned int flags); - int (*task_setrlimit)(struct task_struct *p, unsigned int resource, - struct rlimit *new_rlim); - int (*task_setscheduler)(struct task_struct *p); - int (*task_getscheduler)(struct task_struct *p); - int (*task_movememory)(struct task_struct *p); - int (*task_kill)(struct task_struct *p, struct kernel_siginfo *info, - int sig, const struct cred *cred); - int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3, - unsigned long arg4, unsigned long arg5); - void (*task_to_inode)(struct task_struct *p, struct inode *inode); - - int (*ipc_permission)(struct kern_ipc_perm *ipcp, short flag); - void (*ipc_getsecid)(struct kern_ipc_perm *ipcp, u32 *secid); - - int (*msg_msg_alloc_security)(struct msg_msg *msg); - void (*msg_msg_free_security)(struct msg_msg *msg); - - int (*msg_queue_alloc_security)(struct kern_ipc_perm *perm); - void (*msg_queue_free_security)(struct kern_ipc_perm *perm); - int (*msg_queue_associate)(struct kern_ipc_perm *perm, int msqflg); - int (*msg_queue_msgctl)(struct kern_ipc_perm *perm, int cmd); - int (*msg_queue_msgsnd)(struct kern_ipc_perm *perm, struct msg_msg *msg, - int msqflg); - int (*msg_queue_msgrcv)(struct kern_ipc_perm *perm, struct msg_msg *msg, - struct task_struct *target, long type, - int mode); - - int (*shm_alloc_security)(struct kern_ipc_perm *perm); - void (*shm_free_security)(struct kern_ipc_perm *perm); - int (*shm_associate)(struct kern_ipc_perm *perm, int shmflg); - int (*shm_shmctl)(struct kern_ipc_perm *perm, int cmd); - int (*shm_shmat)(struct kern_ipc_perm *perm, char __user *shmaddr, - int shmflg); - - int (*sem_alloc_security)(struct kern_ipc_perm *perm); - void (*sem_free_security)(struct kern_ipc_perm *perm); - int (*sem_associate)(struct kern_ipc_perm *perm, int semflg); - int (*sem_semctl)(struct kern_ipc_perm *perm, int cmd); - int (*sem_semop)(struct kern_ipc_perm *perm, struct sembuf *sops, - unsigned nsops, int alter); - - int (*netlink_send)(struct sock *sk, struct sk_buff *skb); - - void (*d_instantiate)(struct dentry *dentry, struct inode *inode); - - int (*getprocattr)(struct task_struct *p, char *name, char **value); - int (*setprocattr)(const char *name, void *value, size_t size); - int (*ismaclabel)(const char 
*name); - int (*secid_to_secctx)(u32 secid, char **secdata, u32 *seclen); - int (*secctx_to_secid)(const char *secdata, u32 seclen, u32 *secid); - void (*release_secctx)(char *secdata, u32 seclen); - - void (*inode_invalidate_secctx)(struct inode *inode); - int (*inode_notifysecctx)(struct inode *inode, void *ctx, u32 ctxlen); - int (*inode_setsecctx)(struct dentry *dentry, void *ctx, u32 ctxlen); - int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); - -#ifdef CONFIG_SECURITY_NETWORK - int (*unix_stream_connect)(struct sock *sock, struct sock *other, - struct sock *newsk); - int (*unix_may_send)(struct socket *sock, struct socket *other); - - int (*socket_create)(int family, int type, int protocol, int kern); - int (*socket_post_create)(struct socket *sock, int family, int type, - int protocol, int kern); - int (*socket_socketpair)(struct socket *socka, struct socket *sockb); - int (*socket_bind)(struct socket *sock, struct sockaddr *address, - int addrlen); - int (*socket_connect)(struct socket *sock, struct sockaddr *address, - int addrlen); - int (*socket_listen)(struct socket *sock, int backlog); - int (*socket_accept)(struct socket *sock, struct socket *newsock); - int (*socket_sendmsg)(struct socket *sock, struct msghdr *msg, - int size); - int (*socket_recvmsg)(struct socket *sock, struct msghdr *msg, - int size, int flags); - int (*socket_getsockname)(struct socket *sock); - int (*socket_getpeername)(struct socket *sock); - int (*socket_getsockopt)(struct socket *sock, int level, int optname); - int (*socket_setsockopt)(struct socket *sock, int level, int optname); - int (*socket_shutdown)(struct socket *sock, int how); - int (*socket_sock_rcv_skb)(struct sock *sk, struct sk_buff *skb); - int (*socket_getpeersec_stream)(struct socket *sock, - char __user *optval, - int __user *optlen, unsigned len); - int (*socket_getpeersec_dgram)(struct socket *sock, - struct sk_buff *skb, u32 *secid); - int (*sk_alloc_security)(struct sock *sk, int family, gfp_t priority); - void (*sk_free_security)(struct sock *sk); - void (*sk_clone_security)(const struct sock *sk, struct sock *newsk); - void (*sk_getsecid)(struct sock *sk, u32 *secid); - void (*sock_graft)(struct sock *sk, struct socket *parent); - int (*inet_conn_request)(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - void (*inet_csk_clone)(struct sock *newsk, - const struct request_sock *req); - void (*inet_conn_established)(struct sock *sk, struct sk_buff *skb); - int (*secmark_relabel_packet)(u32 secid); - void (*secmark_refcount_inc)(void); - void (*secmark_refcount_dec)(void); - void (*req_classify_flow)(const struct request_sock *req, - struct flowi *fl); - int (*tun_dev_alloc_security)(void **security); - void (*tun_dev_free_security)(void *security); - int (*tun_dev_create)(void); - int (*tun_dev_attach_queue)(void *security); - int (*tun_dev_attach)(struct sock *sk, void *security); - int (*tun_dev_open)(void *security); - int (*sctp_assoc_request)(struct sctp_endpoint *ep, - struct sk_buff *skb); - int (*sctp_bind_connect)(struct sock *sk, int optname, - struct sockaddr *address, int addrlen); - void (*sctp_sk_clone)(struct sctp_endpoint *ep, struct sock *sk, - struct sock *newsk); -#endif /* CONFIG_SECURITY_NETWORK */ - -#ifdef CONFIG_SECURITY_INFINIBAND - int (*ib_pkey_access)(void *sec, u64 subnet_prefix, u16 pkey); - int (*ib_endport_manage_subnet)(void *sec, const char *dev_name, - u8 port_num); - int (*ib_alloc_security)(void **sec); - void (*ib_free_security)(void *sec); -#endif /* 
CONFIG_SECURITY_INFINIBAND */ - -#ifdef CONFIG_SECURITY_NETWORK_XFRM - int (*xfrm_policy_alloc_security)(struct xfrm_sec_ctx **ctxp, - struct xfrm_user_sec_ctx *sec_ctx, - gfp_t gfp); - int (*xfrm_policy_clone_security)(struct xfrm_sec_ctx *old_ctx, - struct xfrm_sec_ctx **new_ctx); - void (*xfrm_policy_free_security)(struct xfrm_sec_ctx *ctx); - int (*xfrm_policy_delete_security)(struct xfrm_sec_ctx *ctx); - int (*xfrm_state_alloc)(struct xfrm_state *x, - struct xfrm_user_sec_ctx *sec_ctx); - int (*xfrm_state_alloc_acquire)(struct xfrm_state *x, - struct xfrm_sec_ctx *polsec, - u32 secid); - void (*xfrm_state_free_security)(struct xfrm_state *x); - int (*xfrm_state_delete_security)(struct xfrm_state *x); - int (*xfrm_policy_lookup)(struct xfrm_sec_ctx *ctx, u32 fl_secid, - u8 dir); - int (*xfrm_state_pol_flow_match)(struct xfrm_state *x, - struct xfrm_policy *xp, - const struct flowi *fl); - int (*xfrm_decode_session)(struct sk_buff *skb, u32 *secid, int ckall); -#endif /* CONFIG_SECURITY_NETWORK_XFRM */ - - /* key management security hooks */ -#ifdef CONFIG_KEYS - int (*key_alloc)(struct key *key, const struct cred *cred, - unsigned long flags); - void (*key_free)(struct key *key); - int (*key_permission)(key_ref_t key_ref, const struct cred *cred, - unsigned perm); - int (*key_getsecurity)(struct key *key, char **_buffer); -#endif /* CONFIG_KEYS */ - -#ifdef CONFIG_AUDIT - int (*audit_rule_init)(u32 field, u32 op, char *rulestr, - void **lsmrule); - int (*audit_rule_known)(struct audit_krule *krule); - int (*audit_rule_match)(u32 secid, u32 field, u32 op, void *lsmrule); - void (*audit_rule_free)(void *lsmrule); -#endif /* CONFIG_AUDIT */ - -#ifdef CONFIG_BPF_SYSCALL - int (*bpf)(int cmd, union bpf_attr *attr, - unsigned int size); - int (*bpf_map)(struct bpf_map *map, fmode_t fmode); - int (*bpf_prog)(struct bpf_prog *prog); - int (*bpf_map_alloc_security)(struct bpf_map *map); - void (*bpf_map_free_security)(struct bpf_map *map); - int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux); - void (*bpf_prog_free_security)(struct bpf_prog_aux *aux); -#endif /* CONFIG_BPF_SYSCALL */ - int (*locked_down)(enum lockdown_reason what); -#ifdef CONFIG_PERF_EVENTS - int (*perf_event_open)(struct perf_event_attr *attr, int type); - int (*perf_event_alloc)(struct perf_event *event); - void (*perf_event_free)(struct perf_event *event); - int (*perf_event_read)(struct perf_event *event); - int (*perf_event_write)(struct perf_event *event); - -#endif + #define LSM_HOOK(RET, DEFAULT, NAME, ...) 
RET (*NAME)(__VA_ARGS__); + #include "lsm_hook_defs.h" + #undef LSM_HOOK }; struct security_hook_heads { - struct hlist_head binder_set_context_mgr; - struct hlist_head binder_transaction; - struct hlist_head binder_transfer_binder; - struct hlist_head binder_transfer_file; - struct hlist_head ptrace_access_check; - struct hlist_head ptrace_traceme; - struct hlist_head capget; - struct hlist_head capset; - struct hlist_head capable; - struct hlist_head quotactl; - struct hlist_head quota_on; - struct hlist_head syslog; - struct hlist_head settime; - struct hlist_head vm_enough_memory; - struct hlist_head bprm_set_creds; - struct hlist_head bprm_check_security; - struct hlist_head bprm_committing_creds; - struct hlist_head bprm_committed_creds; - struct hlist_head fs_context_dup; - struct hlist_head fs_context_parse_param; - struct hlist_head sb_alloc_security; - struct hlist_head sb_free_security; - struct hlist_head sb_free_mnt_opts; - struct hlist_head sb_eat_lsm_opts; - struct hlist_head sb_remount; - struct hlist_head sb_kern_mount; - struct hlist_head sb_show_options; - struct hlist_head sb_statfs; - struct hlist_head sb_mount; - struct hlist_head sb_umount; - struct hlist_head sb_pivotroot; - struct hlist_head sb_set_mnt_opts; - struct hlist_head sb_clone_mnt_opts; - struct hlist_head sb_add_mnt_opt; - struct hlist_head move_mount; - struct hlist_head dentry_init_security; - struct hlist_head dentry_create_files_as; -#ifdef CONFIG_SECURITY_PATH - struct hlist_head path_unlink; - struct hlist_head path_mkdir; - struct hlist_head path_rmdir; - struct hlist_head path_mknod; - struct hlist_head path_truncate; - struct hlist_head path_symlink; - struct hlist_head path_link; - struct hlist_head path_rename; - struct hlist_head path_chmod; - struct hlist_head path_chown; - struct hlist_head path_chroot; -#endif - /* Needed for inode based modules as well */ - struct hlist_head path_notify; - struct hlist_head inode_alloc_security; - struct hlist_head inode_free_security; - struct hlist_head inode_init_security; - struct hlist_head inode_create; - struct hlist_head inode_link; - struct hlist_head inode_unlink; - struct hlist_head inode_symlink; - struct hlist_head inode_mkdir; - struct hlist_head inode_rmdir; - struct hlist_head inode_mknod; - struct hlist_head inode_rename; - struct hlist_head inode_readlink; - struct hlist_head inode_follow_link; - struct hlist_head inode_permission; - struct hlist_head inode_setattr; - struct hlist_head inode_getattr; - struct hlist_head inode_setxattr; - struct hlist_head inode_post_setxattr; - struct hlist_head inode_getxattr; - struct hlist_head inode_listxattr; - struct hlist_head inode_removexattr; - struct hlist_head inode_need_killpriv; - struct hlist_head inode_killpriv; - struct hlist_head inode_getsecurity; - struct hlist_head inode_setsecurity; - struct hlist_head inode_listsecurity; - struct hlist_head inode_getsecid; - struct hlist_head inode_copy_up; - struct hlist_head inode_copy_up_xattr; - struct hlist_head kernfs_init_security; - struct hlist_head file_permission; - struct hlist_head file_alloc_security; - struct hlist_head file_free_security; - struct hlist_head file_ioctl; - struct hlist_head mmap_addr; - struct hlist_head mmap_file; - struct hlist_head file_mprotect; - struct hlist_head file_lock; - struct hlist_head file_fcntl; - struct hlist_head file_set_fowner; - struct hlist_head file_send_sigiotask; - struct hlist_head file_receive; - struct hlist_head file_open; - struct hlist_head task_alloc; - struct hlist_head task_free; - 
struct hlist_head cred_alloc_blank; - struct hlist_head cred_free; - struct hlist_head cred_prepare; - struct hlist_head cred_transfer; - struct hlist_head cred_getsecid; - struct hlist_head kernel_act_as; - struct hlist_head kernel_create_files_as; - struct hlist_head kernel_load_data; - struct hlist_head kernel_read_file; - struct hlist_head kernel_post_read_file; - struct hlist_head kernel_module_request; - struct hlist_head task_fix_setuid; - struct hlist_head task_setpgid; - struct hlist_head task_getpgid; - struct hlist_head task_getsid; - struct hlist_head task_getsecid; - struct hlist_head task_setnice; - struct hlist_head task_setioprio; - struct hlist_head task_getioprio; - struct hlist_head task_prlimit; - struct hlist_head task_setrlimit; - struct hlist_head task_setscheduler; - struct hlist_head task_getscheduler; - struct hlist_head task_movememory; - struct hlist_head task_kill; - struct hlist_head task_prctl; - struct hlist_head task_to_inode; - struct hlist_head ipc_permission; - struct hlist_head ipc_getsecid; - struct hlist_head msg_msg_alloc_security; - struct hlist_head msg_msg_free_security; - struct hlist_head msg_queue_alloc_security; - struct hlist_head msg_queue_free_security; - struct hlist_head msg_queue_associate; - struct hlist_head msg_queue_msgctl; - struct hlist_head msg_queue_msgsnd; - struct hlist_head msg_queue_msgrcv; - struct hlist_head shm_alloc_security; - struct hlist_head shm_free_security; - struct hlist_head shm_associate; - struct hlist_head shm_shmctl; - struct hlist_head shm_shmat; - struct hlist_head sem_alloc_security; - struct hlist_head sem_free_security; - struct hlist_head sem_associate; - struct hlist_head sem_semctl; - struct hlist_head sem_semop; - struct hlist_head netlink_send; - struct hlist_head d_instantiate; - struct hlist_head getprocattr; - struct hlist_head setprocattr; - struct hlist_head ismaclabel; - struct hlist_head secid_to_secctx; - struct hlist_head secctx_to_secid; - struct hlist_head release_secctx; - struct hlist_head inode_invalidate_secctx; - struct hlist_head inode_notifysecctx; - struct hlist_head inode_setsecctx; - struct hlist_head inode_getsecctx; -#ifdef CONFIG_SECURITY_NETWORK - struct hlist_head unix_stream_connect; - struct hlist_head unix_may_send; - struct hlist_head socket_create; - struct hlist_head socket_post_create; - struct hlist_head socket_socketpair; - struct hlist_head socket_bind; - struct hlist_head socket_connect; - struct hlist_head socket_listen; - struct hlist_head socket_accept; - struct hlist_head socket_sendmsg; - struct hlist_head socket_recvmsg; - struct hlist_head socket_getsockname; - struct hlist_head socket_getpeername; - struct hlist_head socket_getsockopt; - struct hlist_head socket_setsockopt; - struct hlist_head socket_shutdown; - struct hlist_head socket_sock_rcv_skb; - struct hlist_head socket_getpeersec_stream; - struct hlist_head socket_getpeersec_dgram; - struct hlist_head sk_alloc_security; - struct hlist_head sk_free_security; - struct hlist_head sk_clone_security; - struct hlist_head sk_getsecid; - struct hlist_head sock_graft; - struct hlist_head inet_conn_request; - struct hlist_head inet_csk_clone; - struct hlist_head inet_conn_established; - struct hlist_head secmark_relabel_packet; - struct hlist_head secmark_refcount_inc; - struct hlist_head secmark_refcount_dec; - struct hlist_head req_classify_flow; - struct hlist_head tun_dev_alloc_security; - struct hlist_head tun_dev_free_security; - struct hlist_head tun_dev_create; - struct hlist_head 
tun_dev_attach_queue; - struct hlist_head tun_dev_attach; - struct hlist_head tun_dev_open; - struct hlist_head sctp_assoc_request; - struct hlist_head sctp_bind_connect; - struct hlist_head sctp_sk_clone; -#endif /* CONFIG_SECURITY_NETWORK */ -#ifdef CONFIG_SECURITY_INFINIBAND - struct hlist_head ib_pkey_access; - struct hlist_head ib_endport_manage_subnet; - struct hlist_head ib_alloc_security; - struct hlist_head ib_free_security; -#endif /* CONFIG_SECURITY_INFINIBAND */ -#ifdef CONFIG_SECURITY_NETWORK_XFRM - struct hlist_head xfrm_policy_alloc_security; - struct hlist_head xfrm_policy_clone_security; - struct hlist_head xfrm_policy_free_security; - struct hlist_head xfrm_policy_delete_security; - struct hlist_head xfrm_state_alloc; - struct hlist_head xfrm_state_alloc_acquire; - struct hlist_head xfrm_state_free_security; - struct hlist_head xfrm_state_delete_security; - struct hlist_head xfrm_policy_lookup; - struct hlist_head xfrm_state_pol_flow_match; - struct hlist_head xfrm_decode_session; -#endif /* CONFIG_SECURITY_NETWORK_XFRM */ -#ifdef CONFIG_KEYS - struct hlist_head key_alloc; - struct hlist_head key_free; - struct hlist_head key_permission; - struct hlist_head key_getsecurity; -#endif /* CONFIG_KEYS */ -#ifdef CONFIG_AUDIT - struct hlist_head audit_rule_init; - struct hlist_head audit_rule_known; - struct hlist_head audit_rule_match; - struct hlist_head audit_rule_free; -#endif /* CONFIG_AUDIT */ -#ifdef CONFIG_BPF_SYSCALL - struct hlist_head bpf; - struct hlist_head bpf_map; - struct hlist_head bpf_prog; - struct hlist_head bpf_map_alloc_security; - struct hlist_head bpf_map_free_security; - struct hlist_head bpf_prog_alloc_security; - struct hlist_head bpf_prog_free_security; -#endif /* CONFIG_BPF_SYSCALL */ - struct hlist_head locked_down; -#ifdef CONFIG_PERF_EVENTS - struct hlist_head perf_event_open; - struct hlist_head perf_event_alloc; - struct hlist_head perf_event_free; - struct hlist_head perf_event_read; - struct hlist_head perf_event_write; -#endif + #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; + #include "lsm_hook_defs.h" + #undef LSM_HOOK } __randomize_layout; /* @@ -2100,6 +1490,12 @@ struct lsm_blob_sizes { int lbs_task; }; +/* + * LSM_RET_VOID is used as the default value in LSM_HOOK definitions for void + * LSM hooks (in include/linux/lsm_hook_defs.h). + */ +#define LSM_RET_VOID ((void) 0) + /* * Initializing a security_hook_list structure takes * up a lot of space in a source file. This macro takes diff --git a/security/security.c b/security/security.c index 565bc9b67276..7fed24b9d57e 100644 --- a/security/security.c +++ b/security/security.c @@ -668,6 +668,25 @@ static void __init lsm_early_task(struct task_struct *task) panic("%s: Early task alloc failed.\n", __func__); } +/* + * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and + * can be accessed with: + * + * LSM_RET_DEFAULT(<hook_name>) + * + * The macros below define static constants for the default value of each + * LSM hook. + */ +#define LSM_RET_DEFAULT(NAME) (NAME##_default) +#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) +#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ + static const int LSM_RET_DEFAULT(NAME) = (DEFAULT); +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) + +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK + /* * Hook list operation macros.
* @@ -1338,16 +1357,16 @@ int security_inode_getsecurity(struct inode *inode, const char *name, void **buf int rc; if (unlikely(IS_PRIVATE(inode))) - return -EOPNOTSUPP; + return LSM_RET_DEFAULT(inode_getsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { rc = hp->hook.inode_getsecurity(inode, name, buffer, alloc); - if (rc != -EOPNOTSUPP) + if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } - return -EOPNOTSUPP; + return LSM_RET_DEFAULT(inode_getsecurity); } int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1356,17 +1375,17 @@ int security_inode_setsecurity(struct inode *inode, const char *name, const void int rc; if (unlikely(IS_PRIVATE(inode))) - return -EOPNOTSUPP; + return LSM_RET_DEFAULT(inode_setsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) { rc = hp->hook.inode_setsecurity(inode, name, value, size, flags); - if (rc != -EOPNOTSUPP) + if (rc != LSM_RET_DEFAULT(inode_setsecurity)) return rc; } - return -EOPNOTSUPP; + return LSM_RET_DEFAULT(inode_setsecurity); } int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) @@ -1740,12 +1759,12 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; - int rc = -ENOSYS; + int rc = LSM_RET_DEFAULT(task_prctl); struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) { thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5); - if (thisrc != -ENOSYS) { + if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; @@ -1917,7 +1936,7 @@ int security_getprocattr(struct task_struct *p, const char *lsm, char *name, continue; return hp->hook.getprocattr(p, name, value); } - return -EINVAL; + return LSM_RET_DEFAULT(getprocattr); } int security_setprocattr(const char *lsm, const char *name, void *value, @@ -1930,7 +1949,7 @@ int security_setprocattr(const char *lsm, const char *name, void *value, continue; return hp->hook.setprocattr(name, value, size); } - return -EINVAL; + return LSM_RET_DEFAULT(setprocattr); } int security_netlink_send(struct sock *sk, struct sk_buff *skb) @@ -2315,7 +2334,7 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, const struct flowi *fl) { struct security_hook_list *hp; - int rc = 1; + int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment -- cgit v1.2.3 From 9d3fdea789c8fab51381c2d609932fabe94c0517 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:51 +0100 Subject: bpf: lsm: Provide attachment points for BPF LSM programs When CONFIG_BPF_LSM is enabled, nop functions, bpf_lsm_<hook_name>, are generated for each LSM hook. These functions are initialized as LSM hooks in a subsequent patch.
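For instance, given the declaration LSM_HOOK(int, 0, file_open, struct file *file) in lsm_hook_defs.h, the stub macro introduced below expands to a nop attachment point whose body simply returns the hook's default value:

  noinline int bpf_lsm_file_open(struct file *file)
  {
      return 0;
  }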
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Reviewed-by: Kees Cook Acked-by: Yonghong Song Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-4-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 22 ++++++++++++++++++++++ kernel/bpf/bpf_lsm.c | 14 ++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 include/linux/bpf_lsm.h (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h new file mode 100644 index 000000000000..83b96895829f --- /dev/null +++ b/include/linux/bpf_lsm.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2020 Google LLC. + */ + +#ifndef _LINUX_BPF_LSM_H +#define _LINUX_BPF_LSM_H + +#include <linux/bpf.h> +#include <linux/lsm_hooks.h> + +#ifdef CONFIG_BPF_LSM + +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + RET bpf_lsm_##NAME(__VA_ARGS__); +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK + +#endif /* CONFIG_BPF_LSM */ + +#endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 82875039ca90..3b3bbb28603e 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -7,6 +7,20 @@ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/lsm_hooks.h> +#include <linux/bpf_lsm.h> + +/* For every LSM hook that allows attachment of BPF programs, declare a nop + * function where a BPF program can be attached. + */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) \ +noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +{ \ + return DEFAULT; \ +} + +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK const struct bpf_prog_ops lsm_prog_ops = { }; -- cgit v1.2.3 From 9e4e01dfd3254c7f04f24b7c6b29596bc12332f3 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:52 +0100 Subject: bpf: lsm: Implement attach, detach and execution JITed BPF programs are dynamically attached to the LSM hooks using BPF trampolines. The trampoline prologue generates code to handle conversion of the signature of the hook to the appropriate BPF context. The allocated trampoline programs are attached to the nop functions initialized as LSM hooks. BPF_PROG_TYPE_LSM programs must have a GPL compatible license and need CAP_SYS_ADMIN (required for loading eBPF programs). Upon attachment: * A BPF fexit trampoline is used for LSM hooks with a void return type. * A BPF fmod_ret trampoline is used for LSM hooks which return an int. The attached programs can override the return value of the bpf LSM hook to indicate a MAC Policy decision.
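To make the user-visible result concrete, here is a minimal sketch of the BPF side (illustrative only: the program name is made up, and it assumes a BTF-enabled kernel with a generated "vmlinux.h" plus libbpf's BPF_PROG convenience macro), targeting the int-returning file_mprotect hook:

  #include "vmlinux.h"
  #include <errno.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  /* A GPL-compatible license is mandatory for BPF_PROG_TYPE_LSM programs. */
  char LICENSE[] SEC("license") = "GPL";

  SEC("lsm/file_mprotect")
  int BPF_PROG(mprotect_deny, struct vm_area_struct *vma,
               unsigned long reqprot, unsigned long prot, int ret)
  {
      /* For int hooks the fmod_ret trampoline passes the current return
       * value as the last argument; returning non-zero denies the
       * operation.
       */
      if (ret != 0)
          return ret;
      return -EPERM;
  }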
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Acked-by: Andrii Nakryiko Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-5-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 11 ++++++++++ kernel/bpf/bpf_lsm.c | 23 ++++++++++++++++++++ kernel/bpf/btf.c | 16 +++++++++++++- kernel/bpf/syscall.c | 57 +++++++++++++++++++++++++++++++++---------------- kernel/bpf/trampoline.c | 17 +++++++++++---- kernel/bpf/verifier.c | 19 +++++++++++++---- 6 files changed, 116 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 83b96895829f..af74712af585 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,6 +17,17 @@ #include #undef LSM_HOOK +int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog); + +#else /* !CONFIG_BPF_LSM */ + +static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3b3bbb28603e..19636703b24e 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -22,6 +24,27 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include #undef LSM_HOOK +#define BPF_LSM_SYM_PREFX "bpf_lsm_" + +int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, + const struct bpf_prog *prog) +{ + if (!prog->gpl_compatible) { + bpf_log(vlog, + "LSM programs must have a GPL compatible license\n"); + return -EINVAL; + } + + if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, + sizeof(BPF_LSM_SYM_PREFX) - 1)) { + bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", + prog->aux->attach_btf_id, prog->aux->attach_func_name); + return -EINVAL; + } + + return 0; +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6f397c4da05e..de335cd386f0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3710,7 +3710,21 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } if (arg == nr_args) { - if (prog->expected_attach_type == BPF_TRACE_FEXIT) { + if (prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_LSM_MAC) { + /* When LSM programs are attached to void LSM hooks + * they use FEXIT trampolines and when attached to + * int LSM hooks, they use MODIFY_RETURN trampolines. + * + * While the LSM programs are BPF_MODIFY_RETURN-like + * the check: + * + * if (ret_type != 'int') + * return -EINVAL; + * + * is _not_ done here. This is still safe as LSM hooks + * have only void and int return types. 
+ */ if (!t) return true; t = btf_type_by_id(btf, t->type); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2584b25748c..a616b63f23b4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1935,6 +1936,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, switch (prog_type) { case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; @@ -2366,10 +2368,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) struct file *link_file; int link_fd, err; - if (prog->expected_attach_type != BPF_TRACE_FENTRY && - prog->expected_attach_type != BPF_TRACE_FEXIT && - prog->expected_attach_type != BPF_MODIFY_RETURN && - prog->type != BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + if (prog->expected_attach_type != BPF_TRACE_FENTRY && + prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_MODIFY_RETURN) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_EXT: + if (prog->expected_attach_type != 0) { + err = -EINVAL; + goto out_put_prog; + } + break; + case BPF_PROG_TYPE_LSM: + if (prog->expected_attach_type != BPF_LSM_MAC) { + err = -EINVAL; + goto out_put_prog; + } + break; + default: err = -EINVAL; goto out_put_prog; } @@ -2448,16 +2468,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && - prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_EXT && - prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { - err = -EINVAL; - goto out_put_prog; - } - - if (prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_EXT) { + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: if (attr->raw_tracepoint.name) { /* The attach point for this category of programs * should be specified via btf_id during program load. @@ -2465,11 +2479,14 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) err = -EINVAL; goto out_put_prog; } - if (prog->expected_attach_type == BPF_TRACE_RAW_TP) + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; - else - return bpf_tracing_prog_attach(prog); - } else { + break; + } + return bpf_tracing_prog_attach(prog); + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, u64_to_user_ptr(attr->raw_tracepoint.name), sizeof(buf) - 1) < 0) { @@ -2478,6 +2495,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } buf[sizeof(buf) - 1] = 0; tp_name = buf; + break; + default: + err = -EINVAL; + goto out_put_prog; } btp = bpf_get_raw_tracepoint(tp_name); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f30bca2a4d01..9be85aa4ec5f 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -6,6 +6,7 @@ #include #include #include +#include /* dummy _ops. The verifier will operate on target program's ops. 
*/ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -233,15 +234,23 @@ out: return err; } -static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(enum bpf_attach_type t) +static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { - switch (t) { + switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_LSM_MAC: + if (!prog->aux->attach_func_proto->type) + /* The function returns void, we cannot modify its + * return value. + */ + return BPF_TRAMP_FEXIT; + else + return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } @@ -255,7 +264,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) int cnt; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { /* cannot attach fentry/fexit if extension prog is attached. @@ -305,7 +314,7 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) int err; tr = prog->aux->trampoline; - kind = bpf_attach_type_to_tramp(prog->expected_attach_type); + kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 46ba86c540e2..047b2e876399 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "disasm.h" @@ -6492,8 +6493,9 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); int err; - /* The struct_ops func-ptr's return type could be "void" */ - if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS && + /* LSM and struct_ops func-ptr's return type could be "void" */ + if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || + env->prog->type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -9923,7 +9925,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); - if (prog->type != BPF_PROG_TYPE_TRACING && !prog_extension) + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + !prog_extension) return 0; if (!btf_id) { @@ -10054,8 +10058,16 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; /* fallthrough */ case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + prog->aux->attach_func_name = tname; + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + if (!btf_type_is_func(t)) { verbose(env, "attach_btf_id %u is not a function\n", btf_id); @@ -10070,7 +10082,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tr = bpf_trampoline_lookup(key); if (!tr) return -ENOMEM; - prog->aux->attach_func_name = tname; /* t is either vmlinux type or another program's type */ prog->aux->attach_func_proto = t; mutex_lock(&tr->mutex); -- cgit v1.2.3 From a08e7fd9123d85dfdf8d1dc61dbe321c8359d25f Mon Sep 17 00:00:00 2001 From: Cambda Zhu Date: Thu, 26 Mar 2020 15:33:14 +0800 Subject: net: Fix typo of SKB_SGO_CB_OFFSET The SKB_SGO_CB_OFFSET should be SKB_GSO_CB_OFFSET which means the offset of the GSO in skb cb. This patch fixes the typo. 
Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation") Signed-off-by: Cambda Zhu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- net/core/dev.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/udp.c | 2 +- net/openvswitch/datapath.c | 2 +- net/xfrm/xfrm_output.c | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e47895082b4b..28b1a2b4459e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4389,8 +4389,8 @@ struct skb_gso_cb { __wsum csum; __u16 csum_start; }; -#define SKB_SGO_CB_OFFSET 32 -#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_SGO_CB_OFFSET)) +#define SKB_GSO_CB_OFFSET 32 +#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { diff --git a/net/core/dev.c b/net/core/dev.c index d760dcc47978..dee392f21466 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3266,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * - * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -3295,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, features &= ~NETIF_F_GSO_PARTIAL; } - BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index aaaaf907e0d8..090d3097ee15 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * insufficent MTU. 
*/ features = netif_skb_features(skb); - BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2633fc231593..7ea90eb4a1ba 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2109,7 +2109,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); - BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 07a7dd185995..d8ae541d22a8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -305,7 +305,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, struct sk_buff *segs, *nskb; int err; - BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index fafc7aba705f..2fd3d990d992 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -535,8 +535,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs, *nskb; - BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); - BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) -- cgit v1.2.3 From 3df523ab582c52f745f9a73b9ebf9368ede555ac Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Fri, 27 Mar 2020 14:48:37 -0700 Subject: mptcp: Add ADD_ADDR handling Add handling for sending and receiving the ADD_ADDR, ADD_ADDR6, and RM_ADDR suboptions. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 20 ++++- include/net/mptcp.h | 9 +++ net/mptcp/crypto.c | 17 +++-- net/mptcp/options.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++-- net/mptcp/protocol.h | 28 ++++++- 5 files changed, 262 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dc964010fef..1225db308957 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -86,9 +86,13 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; - u8 mp_capable : 1, + u16 mp_capable : 1, mp_join : 1, - dss : 1; + dss : 1, + add_addr : 1, + rm_addr : 1, + family : 4, + echo : 1; u8 use_map:1, dsn64:1, data_fin:1, @@ -96,6 +100,16 @@ struct mptcp_options_received { ack64:1, mpc_map:1, __unused:2; + u8 addr_id; + u8 rm_id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u64 ahmac; + u16 port; }; #endif @@ -131,6 +145,8 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_MPTCP) rx_opt->mptcp.mp_capable = 0; rx_opt->mptcp.mp_join = 0; + rx_opt->mptcp.add_addr = 0; + rx_opt->mptcp.rm_addr = 0; rx_opt->mptcp.dss = 0; #endif } diff --git a/include/net/mptcp.h b/include/net/mptcp.h index c971d25431ea..0d5ea71dd3d0 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -33,6 +33,15 @@ struct mptcp_out_options { u16 suboptions; u64 sndr_key; u64 rcvr_key; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u8 addr_id; + u64 ahmac; + u8 rm_id; struct mptcp_ext ext_copy; #endif }; diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 40d1bb18fd60..c151628bd416 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hmac) +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; @@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, u8 key2be[8]; int i; + if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) + len = SHA256_DIGEST_SIZE; + put_unaligned_be64(key1, key1be); put_unaligned_be64(key2, key2be); @@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); - put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + memcpy(&input[SHA256_BLOCK_SIZE], msg, len); sha256_init(&state); - sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + sha256_update(&state, input, SHA256_BLOCK_SIZE + len); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing @@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void) char hmac[20], hmac_hex[41]; u32 nonce1, nonce2; u64 key1, key2; + u8 msg[8]; int i, j; for (i = 0; i < ARRAY_SIZE(tests); ++i) { @@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void) nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); for (j = 0; j < 20; 
++j) sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); hmac_hex[40] = 0; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index aea1a62d9999..6c6c18a09a40 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -178,6 +178,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, break; + case MPTCPOPT_ADD_ADDR: + mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; + if (!mp_opt->echo) { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR || + opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } else { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } + + mp_opt->add_addr = 1; + mp_opt->port = 0; + mp_opt->addr_id = *ptr++; + pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + ptr += 4; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else { + memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + ptr += 16; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#endif + if (!mp_opt->echo) { + mp_opt->ahmac = get_unaligned_be64(ptr); + ptr += 8; + } + break; + + case MPTCPOPT_RM_ADDR: + if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + break; + + mp_opt->rm_addr = 1; + mp_opt->rm_id = *ptr++; + pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + break; + default: break; } @@ -386,6 +451,84 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } +static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[7]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s_addr, 4); + msg[5] = 0; + msg[6] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); + + return get_unaligned_be64(hmac); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in6_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[19]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s6_addr, 16); + msg[17] = 0; + msg[18] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + + return get_unaligned_be64(hmac); +} +#endif + +static bool mptcp_established_options_addr(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sockaddr_storage saddr; + u8 id; + + id = 0; + memset(&saddr, 0, sizeof(saddr)); + + if (saddr.ss_family == AF_INET) { + if (remaining < TCPOLEN_MPTCP_ADD_ADDR) + return false; + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + opts->addr_id = id; + opts->addr = ((struct sockaddr_in *)&saddr)->sin_addr; + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr); + *size = 
TCPOLEN_MPTCP_ADD_ADDR; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (saddr.ss_family == AF_INET6) { + if (remaining < TCPOLEN_MPTCP_ADD_ADDR6) + return false; + opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; + opts->addr_id = id; + opts->ahmac = add_addr6_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr6); + opts->addr6 = ((struct sockaddr_in6 *)&saddr)->sin6_addr; + *size = TCPOLEN_MPTCP_ADD_ADDR6; + } +#endif + pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -393,6 +536,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int opt_size = 0; bool ret = false; + opts->suboptions = 0; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -407,6 +552,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; + if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } return ret; } @@ -521,10 +671,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) else len = TCPOLEN_MPTCP_MPC_ACK; - *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | - (MPTCPOPT_MP_CAPABLE << 12) | - (MPTCP_SUPPORTED_VERSION << 8) | - MPTCP_CAP_HMAC_SHA256); + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, + MPTCP_SUPPORTED_VERSION, + MPTCP_CAP_HMAC_SHA256); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -546,6 +695,50 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } +#endif + + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, + TCPOLEN_MPTCP_RM_ADDR_BASE, + 0, opts->rm_id); + } + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -567,10 +760,7 @@ mp_capable_done: flags |= MPTCP_DSS_DATA_FIN; } - *ptr++ = htonl((TCPOPT_MPTCP << 24) | - (len << 16) | - (MPTCPOPT_DSS << 12) | - (flags)); + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { put_unaligned_be64(mpext->data_ack, ptr); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index eb3f65264a40..471e013d1c32 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -17,6 +17,9 @@ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK 
BIT(2) +#define OPTION_MPTCP_ADD_ADDR BIT(6) +#define OPTION_MPTCP_ADD_ADDR6 BIT(7) +#define OPTION_MPTCP_RM_ADDR BIT(8) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -39,6 +42,16 @@ #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 +#define TCPOLEN_MPTCP_ADD_ADDR 16 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR6 28 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 +#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) @@ -55,10 +68,22 @@ #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) +/* MPTCP ADD_ADDR flags */ +#define MPTCP_ADDR_ECHO BIT(0) +#define MPTCP_ADDR_HMAC_LEN 20 +#define MPTCP_ADDR_IPVERSION_4 4 +#define MPTCP_ADDR_IPVERSION_6 6 + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 +static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) +{ + return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | + ((nib & 0xF) << 8) | field); +} + /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ @@ -219,8 +244,7 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) mptcp_crypto_key_sha(*key, token, idsn); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hash_out); +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { -- cgit v1.2.3 From f296234c98a8fcec94eec80304a873f635d350ea Mon Sep 17 00:00:00 2001 From: Peter Krystad Date: Fri, 27 Mar 2020 14:48:39 -0700 Subject: mptcp: Add handling of incoming MP_JOIN requests Process the MP_JOIN option in a SYN packet with the same flow as MP_CAPABLE but when the third ACK is received add the subflow to the MPTCP socket subflow list instead of adding it to the TCP socket accept queue. The subflow is added at the end of the subflow list so it will not interfere with the existing subflows operation and no data is expected to be transmitted on it. Co-developed-by: Florian Westphal Signed-off-by: Florian Westphal Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Peter Krystad Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 8 ++- include/net/mptcp.h | 11 ++++ net/ipv4/tcp_minisocks.c | 6 ++ net/mptcp/options.c | 107 ++++++++++++++++++++++++++++---- net/mptcp/protocol.c | 96 +++++++++++++++++++++++----- net/mptcp/protocol.h | 23 +++++++ net/mptcp/subflow.c | 158 ++++++++++++++++++++++++++++++++++++++++++----- net/mptcp/token.c | 27 ++++++++ 8 files changed, 390 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1225db308957..421c99c12291 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -92,7 +92,13 @@ struct mptcp_options_received { add_addr : 1, rm_addr : 1, family : 4, - echo : 1; + echo : 1, + backup : 1; + u32 token; + u32 nonce; + u64 thmac; + u8 hmac[20]; + u8 join_id; u8 use_map:1, dsn64:1, data_fin:1, diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 0d5ea71dd3d0..a4aea0e4addc 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -42,6 +42,10 @@ struct mptcp_out_options { u8 addr_id; u64 ahmac; u8 rm_id; + u8 join_id; + u8 backup; + u32 nonce; + u64 thmac; struct mptcp_ext ext_copy; #endif }; @@ -115,6 +119,8 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, skb_ext_find(from, SKB_EXT_MPTCP)); } +bool mptcp_sk_is_subflow(const struct sock *sk); + #else static inline void mptcp_init(void) @@ -181,6 +187,11 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, return true; } +static inline bool mptcp_sk_is_subflow(const struct sock *sk) +{ + return false; +} + #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 03af7c3e75ef..7e40322cc5ec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); + return child; + } + sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a3661318a7af..8e2b2dbadf6d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, mp_opt->rcvr_key, mp_opt->data_len); break; + case MPTCPOPT_MP_JOIN: + mp_opt->mp_join = 1; + if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->token = get_unaligned_be32(ptr); + ptr += 4; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->token, mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->thmac = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->thmac, mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { + ptr += 2; + memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); + pr_debug("MP_JOIN hmac"); + } else { + pr_warn("MP_JOIN bad option size"); + mp_opt->mp_join = 0; + } + break; + case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; @@ -572,37 +604,80 @@ bool mptcp_synack_options(const struct 
request_sock *req, unsigned int *size, pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); return true; + } else if (subflow_req->mp_join) { + opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; + opts->backup = subflow_req->backup; + opts->join_id = subflow_req->local_id; + opts->thmac = subflow_req->thmac; + opts->nonce = subflow_req->local_nonce; + pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", + subflow_req, opts->backup, opts->join_id, + opts->thmac, opts->nonce); + *size = TCPOLEN_MPTCP_MPJ_SYNACK; + return true; } return false; } -static bool check_fully_established(struct mptcp_subflow_context *subflow, +static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, + struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack - * are relevant + * will make the subflow fully established */ - if (likely(subflow->fully_established || - TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) - return true; + if (likely(subflow->fully_established)) { + /* on passive sockets, check for 3rd ack retransmission + * note that msk is always set by subflow_syn_recv_sock() + * for mp_join subflows + */ + if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && + subflow->mp_join && mp_opt->mp_join && + READ_ONCE(msk->pm.server_side)) + tcp_send_ack(sk); + goto fully_established; + } + + /* we should process OoO packets before the first subflow is fully + * established, but not expected for MP_JOIN subflows + */ + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + return subflow->mp_capable; - if (mp_opt->use_ack) + if (mp_opt->use_ack) { + /* subflows are fully established as soon as we get any + * additional ack. 
+ */ subflow->fully_established = 1; + goto fully_established; + } - if (subflow->can_ack) - return true; + WARN_ON_ONCE(subflow->can_ack); /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + tcp_sk(sk)->is_mptcp = 0; return false; } + + subflow->fully_established = 1; subflow->remote_key = mp_opt->sndr_key; subflow->can_ack = 1; + +fully_established: + if (likely(subflow->pm_notified)) + return true; + + subflow->pm_notified = 1; + if (subflow->mp_join) + mptcp_pm_subflow_established(msk, subflow); + else + mptcp_pm_fully_established(msk); return true; } @@ -641,7 +716,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct mptcp_ext *mpext; mp_opt = &opt_rx->mptcp; - if (!check_fully_established(subflow, skb, mp_opt)) + if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) return; if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { @@ -700,8 +775,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, } mpext->data_fin = mp_opt->data_fin; - - mptcp_pm_fully_established(msk); } void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) @@ -787,6 +860,16 @@ mp_capable_done: 0, opts->rm_id); } + if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5f93e8c94ef9..f2dac715a17e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -104,19 +104,6 @@ set_state: return ssock; } -static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - sock_owned_by_me((const struct sock *)msk); - - mptcp_for_each_subflow(msk, subflow) { - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) @@ -391,6 +378,43 @@ out: return ret; } +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *backup = NULL; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!sk_stream_memory_free(ssk)) { + struct socket *sock = ssk->sk_socket; + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); + } + + return NULL; + } + + if (subflow->backup) { + if (!backup) + backup = ssk; + + continue; + } + + return ssk; + } + + return backup; +} + static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) { struct socket *sock; @@ -438,10 +462,17 @@ fallback: return ret >= 0 ? ret + copied : (copied ? 
copied : ret); } - ssk = mptcp_subflow_get(msk); - if (!ssk) { - release_sock(sk); - return -ENOTCONN; + ssk = mptcp_subflow_get_send(msk); + while (!sk_stream_memory_free(sk) || !ssk) { + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; + + ssk = mptcp_subflow_get_send(msk); + if (list_empty(&msk->conn_list)) { + ret = -ENOTCONN; + goto out; + } } pr_debug("conn_list->subflow=%p", ssk); @@ -1070,6 +1101,37 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } +bool mptcp_finish_join(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sock *parent = (void *)msk; + struct socket *parent_sock; + + pr_debug("msk=%p, subflow=%p", msk, subflow); + + /* mptcp socket already closing? */ + if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + return false; + + if (!msk->pm.server_side) + return true; + + /* passive connection, attach to msk socket */ + parent_sock = READ_ONCE(parent->sk_socket); + if (parent_sock && !sk->sk_socket) + mptcp_sock_graft(sk, parent_sock); + + return mptcp_pm_allow_new_subflow(msk); +} + +bool mptcp_sk_is_subflow(const struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + return subflow->mp_join == 1; +} + static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8d4761ae3951..ef94e36b8560 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -17,6 +17,9 @@ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) +#define OPTION_MPTCP_MPJ_SYN BIT(3) +#define OPTION_MPTCP_MPJ_SYNACK BIT(4) +#define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_ADD_ADDR6 BIT(7) #define OPTION_MPTCP_RM_ADDR BIT(8) @@ -36,6 +39,9 @@ #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_MPJ_SYN 12 +#define TCPOLEN_MPTCP_MPJ_SYNACK 16 +#define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 @@ -53,6 +59,9 @@ #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define MPTCPOPT_BACKUP BIT(0) +#define MPTCPOPT_HMAC_LEN 20 + /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) @@ -162,11 +171,15 @@ struct mptcp_subflow_request_sock { backup : 1, remote_key_valid : 1; u8 local_id; + u8 remote_id; u64 local_key; u64 remote_key; u64 idsn; u32 token; u32 ssn_offset; + u64 thmac; + u32 local_nonce; + u32 remote_nonce; }; static inline struct mptcp_subflow_request_sock * @@ -190,15 +203,23 @@ struct mptcp_subflow_context { u32 map_data_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ mp_capable : 1, /* remote is MPTCP capable */ + mp_join : 1, /* remote is JOINing */ fully_established : 1, /* path validated */ + pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, mpc_map : 1, + backup : 1, data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; + u32 remote_nonce; + u64 thmac; + u32 local_nonce; + u8 local_id; + u8 remote_id; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -270,11 +291,13 @@ void 
mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); +bool mptcp_finish_join(struct sock *sk); int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); int mptcp_token_new_connect(struct sock *sk); int mptcp_token_new_accept(u32 token, struct sock *conn); +struct mptcp_sock *mptcp_token_get_sock(u32 token); void mptcp_token_destroy(u32 token); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index eb82cb33dde2..e7caa4f6e1e5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,52 @@ static void subflow_req_destructor(struct request_sock *req) tcp_request_sock_ops.destructor(req); } +static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 msg[8]; + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); +} + +/* validate received token and create truncated hmac and nonce for SYN-ACK */ +static bool subflow_token_join_request(struct request_sock *req, + const struct sk_buff *skb) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + u8 hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + int local_id; + + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) { + pr_debug("subflow_req=%p, token=%u - not found\n", + subflow_req, subflow_req->token); + return false; + } + + local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); + if (local_id < 0) { + sock_put((struct sock *)msk); + return false; + } + subflow_req->local_id = local_id; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + subflow_req->thmac = get_unaligned_be64(hmac); + + sock_put((struct sock *)msk); + return true; +} + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -61,6 +108,7 @@ static void subflow_init_req(struct request_sock *req, mptcp_get_options(skb, &rx_opt); subflow_req->mp_capable = 0; + subflow_req->mp_join = 0; subflow_req->remote_key_valid = 0; #ifdef CONFIG_TCP_MD5SIG @@ -71,6 +119,9 @@ static void subflow_init_req(struct request_sock *req, return; #endif + if (rx_opt.mptcp.mp_capable && rx_opt.mptcp.mp_join) + return; + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { int err; @@ -79,6 +130,18 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) { + subflow_req->mp_join = 1; + subflow_req->backup = rx_opt.mptcp.backup; + subflow_req->remote_id = rx_opt.mptcp.join_id; + subflow_req->token = rx_opt.mptcp.token; + subflow_req->remote_nonce = rx_opt.mptcp.nonce; + pr_debug("token=%u, remote_nonce=%u", subflow_req->token, + subflow_req->remote_nonce); + if (!subflow_token_join_request(req, skb)) { + subflow_req->mp_join = 0; + // @@ need to trigger RST + } } } @@ -178,6 +241,32 @@ drop: } #endif +/* validate hmac received in third ACK */ +static bool subflow_hmac_valid(const struct request_sock *req, + const struct tcp_options_received *rx_opt) +{ + const struct mptcp_subflow_request_sock *subflow_req; + u8 
hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + bool ret; + + subflow_req = mptcp_subflow_rsk(req); + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) + return false; + + subflow_generate_hmac(msk->remote_key, msk->local_key, + subflow_req->remote_nonce, + subflow_req->local_nonce, hmac); + + ret = true; + if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac))) + ret = false; + + sock_put((struct sock *)msk); + return ret; +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -188,6 +277,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct tcp_options_received opt_rx; + bool fallback_is_fatal = false; struct sock *new_msk = NULL; struct sock *child; @@ -221,6 +311,13 @@ create_msk: new_msk = mptcp_sk_clone(listener->conn, req); if (!new_msk) subflow_req->mp_capable = 0; + } else if (subflow_req->mp_join) { + fallback_is_fatal = true; + opt_rx.mptcp.mp_join = 0; + mptcp_get_options(skb, &opt_rx); + if (!opt_rx.mptcp.mp_join || + !subflow_hmac_valid(req, &opt_rx)) + return NULL; } create_child: @@ -230,11 +327,14 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); - /* we have null ctx on TCP fallback, not fatal on MPC - * handshake + /* we have null ctx on TCP fallback, which is fatal on + * MPJ handshake */ - if (!ctx) + if (!ctx) { + if (fallback_is_fatal) + goto close_child; goto out; + } if (ctx->mp_capable) { /* new mpc subflow takes ownership of the newly @@ -244,6 +344,16 @@ create_child: mptcp_pm_new_connection(mptcp_sk(new_msk), 1); ctx->conn = new_msk; new_msk = NULL; + } else if (ctx->mp_join) { + struct mptcp_sock *owner; + + owner = mptcp_token_get_sock(ctx->token); + if (!owner) + goto close_child; + + ctx->conn = (struct sock *)owner; + if (!mptcp_finish_join(child)) + goto close_child; } } @@ -252,6 +362,12 @@ out: if (unlikely(new_msk)) sock_put(new_msk); return child; + +close_child: + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; } static struct inet_connection_sock_af_ops subflow_specific; @@ -566,7 +682,7 @@ static void subflow_data_ready(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - if (!subflow->mp_capable) { + if (!subflow->mp_capable && !subflow->mp_join) { subflow->tcp_data_ready(sk); parent->sk_data_ready(parent); @@ -785,7 +901,8 @@ static void subflow_ulp_clone(const struct request_sock *req, struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; - if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { + if (!tcp_rsk(req)->is_mptcp || + (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } @@ -796,9 +913,6 @@ static void subflow_ulp_clone(const struct request_sock *req, return; } - /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully - * established only after we receive the remote key - */ new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; @@ -807,14 +921,26 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; - new_ctx->mp_capable = 1; - new_ctx->fully_established = subflow_req->remote_key_valid; 
- new_ctx->can_ack = subflow_req->remote_key_valid; - new_ctx->remote_key = subflow_req->remote_key; - new_ctx->local_key = subflow_req->local_key; - new_ctx->token = subflow_req->token; - new_ctx->ssn_offset = subflow_req->ssn_offset; - new_ctx->idsn = subflow_req->idsn; + if (subflow_req->mp_capable) { + /* see comments in subflow_syn_recv_sock(), MPTCP connection + * is fully established only after we receive the remote key + */ + new_ctx->mp_capable = 1; + new_ctx->fully_established = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; + } else if (subflow_req->mp_join) { + new_ctx->mp_join = 1; + new_ctx->fully_established = 1; + new_ctx->backup = subflow_req->backup; + new_ctx->local_id = subflow_req->local_id; + new_ctx->token = subflow_req->token; + new_ctx->thmac = subflow_req->thmac; + } } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { diff --git a/net/mptcp/token.c b/net/mptcp/token.c index b71b53c0ac8d..129a5ad1bc35 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -140,6 +140,33 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) return err; } +/** + * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @token: token of the mptcp connection to retrieve + * + * This function returns the mptcp connection structure with the given token. + * A reference count on the mptcp socket returned is taken. + * + * returns NULL if no connection with the given token value exists. + */ +struct mptcp_sock *mptcp_token_get_sock(u32 token) +{ + struct sock *conn; + + spin_lock_bh(&token_tree_lock); + conn = radix_tree_lookup(&token_tree, token); + if (conn) { + /* token still reserved? */ + if (conn == (struct sock *)&token_used) + conn = NULL; + else + sock_hold(conn); + } + spin_unlock_bh(&token_tree_lock); + + return mptcp_sk(conn); +} + /** * mptcp_token_destroy_request - remove mptcp connection/token * @token - token of mptcp connection to remove -- cgit v1.2.3 From 8610c7c6e3bd647ff98d21c8bc0580e77bc2f8b3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 27 Mar 2020 18:00:20 -0400 Subject: net: ipv6: add support for rpl sr exthdr This patch adds RPL source routing receive handling. It takes effect only if the "rpl_seg_enabled" sysctl and source routing are both enabled. The behaviour is mostly the same as IPv6 segment routing. A new rpl.c file contains the functionality needed to handle compression and decompression. The receive handling also takes care of IPv6 encapsulation, since RFC 6554 specifies it as a possible nexthdr. Signed-off-by: Alexander Aring Signed-off-by: David S.
Miller --- include/linux/ipv6.h | 1 + include/net/rpl.h | 34 ++++++++ include/uapi/linux/ipv6.h | 2 + net/ipv6/Makefile | 2 +- net/ipv6/addrconf.c | 10 +++ net/ipv6/exthdrs.c | 201 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/rpl.c | 123 ++++++++++++++++++++++++++++ 7 files changed, 370 insertions(+), 3 deletions(-) create mode 100644 include/net/rpl.h create mode 100644 net/ipv6/rpl.c (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ea7c7906591e..2cb445a8fc9e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -74,6 +74,7 @@ struct ipv6_devconf { __u32 addr_gen_mode; __s32 disable_policy; __s32 ndisc_tclass; + __s32 rpl_seg_enabled; struct ctl_table_header *sysctl_header; }; diff --git a/include/net/rpl.h b/include/net/rpl.h new file mode 100644 index 000000000000..d694d49a61db --- /dev/null +++ b/include/net/rpl.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RPL implementation + * + * Author: + * (C) 2020 Alexander Aring + */ + +#ifndef _NET_RPL_H +#define _NET_RPL_H + +#include + +/* Worst decompression memory usage ipv6 address (16) + pad 7 */ +#define IPV6_RPL_SRH_WORST_SWAP_SIZE (sizeof(struct in6_addr) + 7) + +static inline size_t ipv6_rpl_srh_alloc_size(unsigned char n) +{ + return sizeof(struct ipv6_rpl_sr_hdr) + + ((n + 1) * sizeof(struct in6_addr)); +} + +size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, + unsigned char cmpre); + +void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n); + +void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n); + +#endif /* _NET_RPL_H */ diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 9c0f4a92bcff..13e8751bf24a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -40,6 +40,7 @@ struct in6_ifreq { #define IPV6_SRCRT_STRICT 0x01 /* Deprecated; will be removed */ #define IPV6_SRCRT_TYPE_0 0 /* Deprecated; will be removed */ #define IPV6_SRCRT_TYPE_2 2 /* IPv6 type 2 Routing Header */ +#define IPV6_SRCRT_TYPE_3 3 /* RPL Segment Routing with IPv6 */ #define IPV6_SRCRT_TYPE_4 4 /* Segment Routing with IPv6 */ /* @@ -187,6 +188,7 @@ enum { DEVCONF_DISABLE_POLICY, DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_NDISC_TCLASS, + DEVCONF_RPL_SEG_ENABLED, DEVCONF_MAX }; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 8ccf35514015..9d3e9bd2334f 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 594963a7e1ec..a11fd4d67832 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, + .rpl_seg_enabled = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -290,6 +291,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .enhanced_dad = 1, .addr_gen_mode = 
IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, + .rpl_seg_enabled = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -5520,6 +5522,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; + array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; } static inline size_t inet6_ifla6_size(void) @@ -6900,6 +6903,13 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = (void *)SYSCTL_ZERO, .extra2 = (void *)&two_five_five, }, + { + .procname = "rpl_seg_enabled", + .data = &ipv6_devconf.rpl_seg_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bcb9f5e62808..5a8bbcdcaf2b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -48,6 +48,7 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include #endif +#include #include @@ -468,6 +469,195 @@ looped_back: return -1; } +static int ipv6_rpl_srh_rcv(struct sk_buff *skb) +{ + struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + struct inet6_dev *idev; + struct ipv6hdr *oldhdr; + struct in6_addr addr; + unsigned char *buf; + int accept_rpl_seg; + int i, err; + u64 n = 0; + u32 r; + + idev = __in6_dev_get(skb->dev); + + accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; + if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) + accept_rpl_seg = idev->cnf.rpl_seg_enabled; + + if (!accept_rpl_seg) { + kfree_skb(skb); + return -1; + } + +looped_back: + hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); + + if (hdr->segments_left == 0) { + if (hdr->nexthdr == NEXTHDR_IPV6) { + int offset = (hdr->hdrlen + 1) << 3; + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + + if (!pskb_pull(skb, offset)) { + kfree_skb(skb); + return -1; + } + skb_postpull_rcsum(skb, skb_transport_header(skb), + offset); + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + __skb_tunnel_rx(skb, skb->dev, net); + + netif_rx(skb); + return -1; + } + + opt->srcrt = skb_network_header_len(skb); + opt->lastopt = opt->srcrt; + skb->transport_header += (hdr->hdrlen + 1) << 3; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); + + return 1; + } + + if (!pskb_may_pull(skb, sizeof(*hdr))) { + kfree_skb(skb); + return -1; + } + + n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); + r = do_div(n, (16 - hdr->cmpri)); + /* checks if calculation was without remainder and n fits into + * unsigned char which is segments_left field. Should not be + * higher than that. 
+ */ + if (r || (n + 1) > 255) { + kfree_skb(skb); + return -1; + } + + if (hdr->segments_left > n + 1) { + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); + return -1; + } + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0, + GFP_ATOMIC)) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -1; + } + } else { + err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE); + if (unlikely(err)) { + kfree_skb(skb); + return -1; + } + } + + hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); + + if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri, + hdr->cmpre))) { + kfree_skb(skb); + return -1; + } + + hdr->segments_left--; + i = n - hdr->segments_left; + + buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC); + if (unlikely(!buf)) { + kfree_skb(skb); + return -1; + } + + ohdr = (struct ipv6_rpl_sr_hdr *)buf; + ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); + chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); + + if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) || + (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) { + kfree_skb(skb); + kfree(buf); + return -1; + } + + err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); + if (err) { + icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); + kfree_skb(skb); + kfree(buf); + return -1; + } + + addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = ohdr->rpl_segaddr[i]; + ohdr->rpl_segaddr[i] = addr; + + ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); + + oldhdr = ipv6_hdr(skb); + + skb_pull(skb, ((hdr->hdrlen + 1) << 3)); + skb_postpull_rcsum(skb, oldhdr, + sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); + skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); + memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_postpush_rcsum(skb, ipv6_hdr(skb), + sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); + + kfree(buf); + + skb_dst_drop(skb); + + ip6_route_input(skb); + + if (skb_dst(skb)->error) { + dst_input(skb); + return -1; + } + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (ipv6_hdr(skb)->hop_limit <= 1) { + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0); + kfree_skb(skb); + return -1; + } + ipv6_hdr(skb)->hop_limit--; + + skb_pull(skb, sizeof(struct ipv6hdr)); + goto looped_back; + } + + dst_input(skb); + + return -1; +} + /******************************** Routing header. 
********************************/ @@ -506,9 +696,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) return -1; } - /* segment routing */ - if (hdr->type == IPV6_SRCRT_TYPE_4) + switch (hdr->type) { + case IPV6_SRCRT_TYPE_4: + /* segment routing */ return ipv6_srh_rcv(skb); + case IPV6_SRCRT_TYPE_3: + /* rpl segment routing */ + return ipv6_rpl_srh_rcv(skb); + default: + break; + } looped_back: if (hdr->segments_left == 0) { diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c new file mode 100644 index 000000000000..dc4f20e23bf7 --- /dev/null +++ b/net/ipv6/rpl.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/** + * Authors: + * (C) 2020 Alexander Aring + */ + +#include +#include + +#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x)) + +static void ipv6_rpl_addr_decompress(struct in6_addr *dst, + const struct in6_addr *daddr, + const void *post, unsigned char pfx) +{ + memcpy(dst, daddr, pfx); + memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx)); +} + +static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr, + unsigned char pfx) +{ + memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx)); +} + +static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) +{ + return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)]; +} + +size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, + unsigned char cmpre) +{ + return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); +} + +void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n) +{ + int i; + + outhdr->nexthdr = inhdr->nexthdr; + outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3); + outhdr->pad = 0; + outhdr->type = inhdr->type; + outhdr->segments_left = inhdr->segments_left; + outhdr->cmpri = 0; + outhdr->cmpre = 0; + + for (i = 0; i <= n; i++) + ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, + ipv6_rpl_segdata_pos(inhdr, i), + inhdr->cmpri); + + ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr, + ipv6_rpl_segdata_pos(inhdr, n), + inhdr->cmpre); +} + +static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, + unsigned char n) +{ + unsigned char plen; + int i; + + for (plen = 0; plen < sizeof(*daddr); plen++) { + for (i = 0; i <= n; i++) { + if (daddr->s6_addr[plen] != + inhdr->rpl_segaddr[i].s6_addr[plen]) + return plen; + } + } + + return plen; +} + +static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr, + const struct in6_addr *last_segment) +{ + unsigned int plen; + + for (plen = 0; plen < sizeof(*daddr); plen++) { + if (daddr->s6_addr[plen] != last_segment->s6_addr[plen]) + break; + } + + return plen; +} + +void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n) +{ + unsigned char cmpri, cmpre; + size_t seglen; + int i; + + cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n); + cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]); + + outhdr->nexthdr = inhdr->nexthdr; + seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); + outhdr->hdrlen = seglen >> 3; + if (seglen & 0x7) { + outhdr->hdrlen++; + outhdr->pad = 8 - (seglen & 0x7); + } else { + outhdr->pad = 0; + } + outhdr->type = inhdr->type; + outhdr->segments_left = inhdr->segments_left; + outhdr->cmpri = cmpri; + outhdr->cmpre = cmpre; + + for (i = 0; i <= n; i++) + 
ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), + &inhdr->rpl_segaddr[i], cmpri); + + ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n), + &inhdr->rpl_segaddr[n], cmpre); +} -- cgit v1.2.3 From 62582a7ee78364c6106d09d5e0f1dc7f564be887 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 29 Mar 2020 07:55:10 -0700 Subject: ptp: Avoid deadlocks in the programmable pin code. The PTP Hardware Clock (PHC) subsystem offers an API for configuring programmable pins. User space sets or gets the settings using ioctls, and drivers verify dialed settings via a callback. Drivers may also query pin settings by calling the ptp_find_pin() method. Although the core subsystem protects concurrent access to the pin settings, the implementation places illogical restrictions on how drivers may call ptp_find_pin(). When enabling an auxiliary function via the .enable(on=1) callback, drivers may invoke the pin finding method, but when disabling with .enable(on=0) drivers are not permitted to do so. With the exception of the mv88e6xxx, all of the PHC drivers do respect this restriction, but still the locking pattern is both confusing and unnecessary. This patch changes the locking implementation to allow PHC drivers to freely call ptp_find_pin() from their .enable() and .verify() callbacks. V2 ChangeLog: - fixed spelling in the kernel doc - add Vladimir's tested by tag Signed-off-by: Richard Cochran Reported-by: Yangbo Lu Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 2 +- drivers/ptp/ptp_chardev.c | 9 +++++++++ drivers/ptp/ptp_clock.c | 17 +++++++++++++++-- include/linux/ptp_clock_kernel.h | 19 +++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index ac72a324fcd1..415c27310982 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -628,7 +628,7 @@ static void recalibrate(struct dp83640_clock *clock) u16 cal_gpio, cfg0, evnt, ptp_trig, trigger, val; trigger = CAL_TRIGGER; - cal_gpio = 1 + ptp_find_pin(clock->ptp_clock, PTP_PF_PHYSYNC, 0); + cal_gpio = 1 + ptp_find_pin_unlocked(clock->ptp_clock, PTP_PF_PHYSYNC, 0); if (cal_gpio < 1) { pr_err("PHY calibration pin not available - PHY is not calibrated."); return; diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 9d72ab593f13..93d574faf1fe 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -175,7 +175,10 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) } req.type = PTP_CLK_REQ_EXTTS; enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0; + if (mutex_lock_interruptible(&ptp->pincfg_mux)) + return -ERESTARTSYS; err = ops->enable(ops, &req, enable); + mutex_unlock(&ptp->pincfg_mux); break; case PTP_PEROUT_REQUEST: @@ -206,7 +209,10 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) } req.type = PTP_CLK_REQ_PEROUT; enable = req.perout.period.sec || req.perout.period.nsec; + if (mutex_lock_interruptible(&ptp->pincfg_mux)) + return -ERESTARTSYS; err = ops->enable(ops, &req, enable); + mutex_unlock(&ptp->pincfg_mux); break; case PTP_ENABLE_PPS: @@ -217,7 +223,10 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) return -EPERM; req.type = PTP_CLK_REQ_PPS; enable = arg ? 
1 : 0; + if (mutex_lock_interruptible(&ptp->pincfg_mux)) + return -ERESTARTSYS; err = ops->enable(ops, &req, enable); + mutex_unlock(&ptp->pincfg_mux); break; case PTP_SYS_OFFSET_PRECISE: diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index ac1f2bf9e888..acabbe72e55e 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -348,7 +348,6 @@ int ptp_find_pin(struct ptp_clock *ptp, struct ptp_pin_desc *pin = NULL; int i; - mutex_lock(&ptp->pincfg_mux); for (i = 0; i < ptp->info->n_pins; i++) { if (ptp->info->pin_config[i].func == func && ptp->info->pin_config[i].chan == chan) { @@ -356,12 +355,26 @@ int ptp_find_pin(struct ptp_clock *ptp, break; } } - mutex_unlock(&ptp->pincfg_mux); return pin ? i : -1; } EXPORT_SYMBOL(ptp_find_pin); +int ptp_find_pin_unlocked(struct ptp_clock *ptp, + enum ptp_pin_function func, unsigned int chan) +{ + int result; + + mutex_lock(&ptp->pincfg_mux); + + result = ptp_find_pin(ptp, func, chan); + + mutex_unlock(&ptp->pincfg_mux); + + return result; +} +EXPORT_SYMBOL(ptp_find_pin_unlocked); + int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) { return kthread_mod_delayed_work(ptp->kworker, &ptp->aux_work, delay); diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index c64a1ef87240..121a7eda4593 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -223,6 +223,12 @@ extern s32 scaled_ppm_to_ppb(long ppm); /** * ptp_find_pin() - obtain the pin index of a given auxiliary function * + * The caller must hold ptp_clock::pincfg_mux. Drivers do not have + * access to that mutex as ptp_clock is an opaque type. However, the + * core code acquires the mutex before invoking the driver's + * ptp_clock_info::enable() callback, and so drivers may call this + * function from that context. + * * @ptp: The clock obtained from ptp_clock_register(). * @func: One of the ptp_pin_function enumerated values. * @chan: The particular functional channel to find. @@ -233,6 +239,19 @@ extern s32 scaled_ppm_to_ppb(long ppm); int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan); +/** + * ptp_find_pin_unlocked() - wrapper for ptp_find_pin() + * + * This function acquires the ptp_clock::pincfg_mux mutex before + * invoking ptp_find_pin(). Instead of using this function, drivers + * should most likely call ptp_find_pin() directly from their + * ptp_clock_info::enable() method. + * + */ + +int ptp_find_pin_unlocked(struct ptp_clock *ptp, + enum ptp_pin_function func, unsigned int chan); + /** * ptp_schedule_worker() - schedule ptp auxiliary work * -- cgit v1.2.3 From 8063f761cd7c17fc1d0018728936e0c33a25388a Mon Sep 17 00:00:00 2001 From: Yuval Basson Date: Sun, 29 Mar 2020 20:32:49 +0300 Subject: qed: Fix use after free in qed_chain_free The qed_chain data structure was modified in commit 1a4a69751f4d ("qed: Chain support for external PBL") to support receiving an external pbl (due to iWARP FW requirements). The pages pointed to by the pbl are allocated in qed_chain_alloc and their virtual address are stored in an virtual addresses array to enable accessing and freeing the data. The physical addresses however weren't stored and were accessed directly from the external-pbl during free. Destroy-qp flow, leads to freeing the external pbl before the chain is freed, when the chain is freed it tries accessing the already freed external pbl, leading to a use-after-free. 
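To picture the fix the next sentence and the diff below describe (editor's sketch: struct addr_tbl_entry matches the patch, while sketch_free_chain_pages is an invented helper name, not driver code):

#include <linux/dma-mapping.h>
#include <linux/types.h>

/* From the patch: one entry per chain page, pairing the CPU mapping
 * with the DMA address it was allocated at.
 */
struct addr_tbl_entry {
	void *virt_addr;
	dma_addr_t dma_map;
};

/* Editor's sketch of the free path: every address now comes from the
 * driver's own table, so nothing dereferences the external PBL, which
 * its owner may already have freed.
 */
static void sketch_free_chain_pages(struct device *dev,
				    struct addr_tbl_entry *tbl,
				    u32 page_cnt, size_t page_size)
{
	u32 i;

	for (i = 0; i < page_cnt; i++) {
		if (!tbl[i].virt_addr || !tbl[i].dma_map)
			break;
		dma_free_coherent(dev, page_size,
				  tbl[i].virt_addr, tbl[i].dma_map);
	}
}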
Therefore we need to store the physical addresses in addition to the virtual addresses in a new data structure. Fixes: 1a4a69751f4d ("qed: Chain support for external PBL") Signed-off-by: Michal Kalderon Signed-off-by: Yuval Basson Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 38 +++++++++++++------------------ include/linux/qed/qed_chain.h | 24 +++++++++++-------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 03bdd2e26329..38a65b984e47 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -4691,26 +4691,20 @@ static void qed_chain_free_single(struct qed_dev *cdev, static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain) { - void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl; + struct addr_tbl_entry *pp_addr_tbl = p_chain->pbl.pp_addr_tbl; u32 page_cnt = p_chain->page_cnt, i, pbl_size; - u8 *p_pbl_virt = p_chain->pbl_sp.p_virt_table; - if (!pp_virt_addr_tbl) + if (!pp_addr_tbl) return; - if (!p_pbl_virt) - goto out; - for (i = 0; i < page_cnt; i++) { - if (!pp_virt_addr_tbl[i]) + if (!pp_addr_tbl[i].virt_addr || !pp_addr_tbl[i].dma_map) break; dma_free_coherent(&cdev->pdev->dev, QED_CHAIN_PAGE_SIZE, - pp_virt_addr_tbl[i], - *(dma_addr_t *)p_pbl_virt); - - p_pbl_virt += QED_CHAIN_PBL_ENTRY_SIZE; + pp_addr_tbl[i].virt_addr, + pp_addr_tbl[i].dma_map); } pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE; @@ -4720,9 +4714,9 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain) pbl_size, p_chain->pbl_sp.p_virt_table, p_chain->pbl_sp.p_phys_table); -out: - vfree(p_chain->pbl.pp_virt_addr_tbl); - p_chain->pbl.pp_virt_addr_tbl = NULL; + + vfree(p_chain->pbl.pp_addr_tbl); + p_chain->pbl.pp_addr_tbl = NULL; } void qed_chain_free(struct qed_dev *cdev, struct qed_chain *p_chain) @@ -4823,19 +4817,19 @@ qed_chain_alloc_pbl(struct qed_dev *cdev, { u32 page_cnt = p_chain->page_cnt, size, i; dma_addr_t p_phys = 0, p_pbl_phys = 0; - void **pp_virt_addr_tbl = NULL; + struct addr_tbl_entry *pp_addr_tbl; u8 *p_pbl_virt = NULL; void *p_virt = NULL; - size = page_cnt * sizeof(*pp_virt_addr_tbl); - pp_virt_addr_tbl = vzalloc(size); - if (!pp_virt_addr_tbl) + size = page_cnt * sizeof(*pp_addr_tbl); + pp_addr_tbl = vzalloc(size); + if (!pp_addr_tbl) return -ENOMEM; /* The allocation of the PBL table is done with its full size, since it * is expected to be successive. * qed_chain_init_pbl_mem() is called even in a case of an allocation - * failure, since pp_virt_addr_tbl was previously allocated, and it + * failure, since tbl was previously allocated, and it * should be saved to allow its freeing during the error flow.
*/ size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE; @@ -4849,8 +4843,7 @@ qed_chain_alloc_pbl(struct qed_dev *cdev, p_chain->b_external_pbl = true; } - qed_chain_init_pbl_mem(p_chain, p_pbl_virt, p_pbl_phys, - pp_virt_addr_tbl); + qed_chain_init_pbl_mem(p_chain, p_pbl_virt, p_pbl_phys, pp_addr_tbl); if (!p_pbl_virt) return -ENOMEM; @@ -4869,7 +4862,8 @@ qed_chain_alloc_pbl(struct qed_dev *cdev, /* Fill the PBL table with the physical address of the page */ *(dma_addr_t *)p_pbl_virt = p_phys; /* Keep the virtual address of the page */ - p_chain->pbl.pp_virt_addr_tbl[i] = p_virt; + p_chain->pbl.pp_addr_tbl[i].virt_addr = p_virt; + p_chain->pbl.pp_addr_tbl[i].dma_map = p_phys; p_pbl_virt += QED_CHAIN_PBL_ENTRY_SIZE; } diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 2dd0a9ed5b36..733fad7dfbed 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -97,6 +97,11 @@ struct qed_chain_u32 { u32 cons_idx; }; +struct addr_tbl_entry { + void *virt_addr; + dma_addr_t dma_map; +}; + struct qed_chain { /* fastpath portion of the chain - required for commands such * as produce / consume. @@ -107,10 +112,11 @@ struct qed_chain { /* Fastpath portions of the PBL [if exists] */ struct { - /* Table for keeping the virtual addresses of the chain pages, - * respectively to the physical addresses in the pbl table. + /* Table for keeping the virtual and physical addresses of the + * chain pages, respectively to the physical addresses + * in the pbl table. */ - void **pp_virt_addr_tbl; + struct addr_tbl_entry *pp_addr_tbl; union { struct qed_chain_pbl_u16 u16; @@ -287,7 +293,7 @@ qed_chain_advance_page(struct qed_chain *p_chain, *(u32 *)page_to_inc = 0; page_index = *(u32 *)page_to_inc; } - *p_next_elem = p_chain->pbl.pp_virt_addr_tbl[page_index]; + *p_next_elem = p_chain->pbl.pp_addr_tbl[page_index].virt_addr; } } @@ -537,7 +543,7 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain, p_chain->pbl_sp.p_phys_table = 0; p_chain->pbl_sp.p_virt_table = NULL; - p_chain->pbl.pp_virt_addr_tbl = NULL; + p_chain->pbl.pp_addr_tbl = NULL; } /** @@ -575,11 +581,11 @@ static inline void qed_chain_init_mem(struct qed_chain *p_chain, static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain, void *p_virt_pbl, dma_addr_t p_phys_pbl, - void **pp_virt_addr_tbl) + struct addr_tbl_entry *pp_addr_tbl) { p_chain->pbl_sp.p_phys_table = p_phys_pbl; p_chain->pbl_sp.p_virt_table = p_virt_pbl; - p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl; + p_chain->pbl.pp_addr_tbl = pp_addr_tbl; } /** @@ -644,7 +650,7 @@ static inline void *qed_chain_get_last_elem(struct qed_chain *p_chain) break; case QED_CHAIN_MODE_PBL: last_page_idx = p_chain->page_cnt - 1; - p_virt_addr = p_chain->pbl.pp_virt_addr_tbl[last_page_idx]; + p_virt_addr = p_chain->pbl.pp_addr_tbl[last_page_idx].virt_addr; break; } /* p_virt_addr points at this stage to the last page of the chain */ @@ -716,7 +722,7 @@ static inline void qed_chain_pbl_zero_mem(struct qed_chain *p_chain) page_cnt = qed_chain_get_page_cnt(p_chain); for (i = 0; i < page_cnt; i++) - memset(p_chain->pbl.pp_virt_addr_tbl[i], 0, + memset(p_chain->pbl.pp_addr_tbl[i].virt_addr, 0, QED_CHAIN_PAGE_SIZE); } -- cgit v1.2.3 From 0bd274060a0f49d974b7e88fa87b6e1c1f496a48 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 30 Mar 2020 18:44:44 +0100 Subject: net: phylink: change phylink_mii_c22_pcs_set_advertisement() prototype Change phylink_mii_c22_pcs_set_advertisement() to take only the PHY interface and advertisement mask, rather 
than the full phylink state. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 12 +++++++----- include/linux/phylink.h | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index fed0c5907c6a..f31bfd39df4b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -2184,7 +2184,8 @@ EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_get_state); * phylink_mii_c22_pcs_set_advertisement() - configure the clause 37 PCS * advertisement * @pcs: a pointer to a &struct mdio_device. - * @state: a pointer to the state being configured. + * @interface: the PHY interface mode being configured + * @advertising: the ethtool advertisement mask * * Helper for MAC PCS supporting the 802.3 clause 22 register set for * clause 37 negotiation and/or SGMII control. @@ -2197,22 +2198,23 @@ EXPORT_SYMBOL_GPL(phylink_mii_c22_pcs_get_state); * zero if no change has been made, or one if the advertisement has changed. */ int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, - const struct phylink_link_state *state) + phy_interface_t interface, + const unsigned long *advertising) { struct mii_bus *bus = pcs->bus; int addr = pcs->addr; int val, ret; u16 adv; - switch (state->interface) { + switch (interface) { case PHY_INTERFACE_MODE_1000BASEX: case PHY_INTERFACE_MODE_2500BASEX: adv = ADVERTISE_1000XFULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, - state->advertising)) + advertising)) adv |= ADVERTISE_1000XPAUSE; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - state->advertising)) + advertising)) adv |= ADVERTISE_1000XPSE_ASYM; val = mdiobus_read(bus, addr, MII_ADVERTISE); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 8fa6df3b881b..6f6ecf3e0be1 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -320,7 +320,8 @@ void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); int phylink_mii_c22_pcs_set_advertisement(struct mdio_device *pcs, - const struct phylink_link_state *state); + phy_interface_t interface, + const unsigned long *advertising); void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, -- cgit v1.2.3 From e7765d634aaa9dd5db3cb59155269ef6c18d4592 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 30 Mar 2020 18:44:50 +0100 Subject: net: phylink: rename 'ops' to 'mac_ops' Rename the bland 'ops' member of struct phylink to be a more descriptive 'mac_ops' - this is necessary as we're about to introduce another set of operations. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/phylink.c | 30 +++++++++++++++--------------- include/linux/phylink.h | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index f31bfd39df4b..e2f30fd4d235 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -40,7 +40,7 @@ enum { struct phylink { /* private: */ struct net_device *netdev; - const struct phylink_mac_ops *ops; + const struct phylink_mac_ops *mac_ops; struct phylink_config *config; struct device *dev; unsigned int old_link_state:1; @@ -154,7 +154,7 @@ static const char *phylink_an_mode_str(unsigned int mode) static int phylink_validate(struct phylink *pl, unsigned long *supported, struct phylink_link_state *state) { - pl->ops->validate(pl->config, supported, state); + pl->mac_ops->validate(pl->config, supported, state); return phylink_is_empty_linkmode(supported) ? -EINVAL : 0; } @@ -415,7 +415,7 @@ static void phylink_mac_config(struct phylink *pl, __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising, state->pause, state->link, state->an_enabled); - pl->ops->mac_config(pl->config, pl->cur_link_an_mode, state); + pl->mac_ops->mac_config(pl->config, pl->cur_link_an_mode, state); } static void phylink_mac_config_up(struct phylink *pl, @@ -429,7 +429,7 @@ static void phylink_mac_an_restart(struct phylink *pl) { if (pl->link_config.an_enabled && phy_interface_mode_is_8023z(pl->link_config.interface)) - pl->ops->mac_an_restart(pl->config); + pl->mac_ops->mac_an_restart(pl->config); } static void phylink_mac_pcs_get_state(struct phylink *pl, @@ -445,7 +445,7 @@ static void phylink_mac_pcs_get_state(struct phylink *pl, state->an_complete = 0; state->link = 1; - pl->ops->mac_pcs_get_state(pl->config, state); + pl->mac_ops->mac_pcs_get_state(pl->config, state); } /* The fixed state is... fixed except for the link state, @@ -512,11 +512,11 @@ static void phylink_mac_link_up(struct phylink *pl, struct net_device *ndev = pl->netdev; pl->cur_interface = link_state.interface; - pl->ops->mac_link_up(pl->config, pl->phydev, - pl->cur_link_an_mode, pl->cur_interface, - link_state.speed, link_state.duplex, - !!(link_state.pause & MLO_PAUSE_TX), - !!(link_state.pause & MLO_PAUSE_RX)); + pl->mac_ops->mac_link_up(pl->config, pl->phydev, + pl->cur_link_an_mode, pl->cur_interface, + link_state.speed, link_state.duplex, + !!(link_state.pause & MLO_PAUSE_TX), + !!(link_state.pause & MLO_PAUSE_RX)); if (ndev) netif_carrier_on(ndev); @@ -534,8 +534,8 @@ static void phylink_mac_link_down(struct phylink *pl) if (ndev) netif_carrier_off(ndev); - pl->ops->mac_link_down(pl->config, pl->cur_link_an_mode, - pl->cur_interface); + pl->mac_ops->mac_link_down(pl->config, pl->cur_link_an_mode, + pl->cur_interface); phylink_info(pl, "Link is Down\n"); } @@ -666,7 +666,7 @@ static int phylink_register_sfp(struct phylink *pl, * @fwnode: a pointer to a &struct fwnode_handle describing the network * interface * @iface: the desired link mode defined by &typedef phy_interface_t - * @ops: a pointer to a &struct phylink_mac_ops for the MAC. + * @mac_ops: a pointer to a &struct phylink_mac_ops for the MAC. * * Create a new phylink instance, and parse the link parameters found in @np. * This will parse in-band modes, fixed-link or SFP configuration. 
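For context, a MAC driver wires these operations up along the following lines (an illustrative sketch only: the foo_* names, the private structure layout, and the SGMII interface choice are hypothetical; phylink_create(), struct phylink_mac_ops, struct phylink_config, and PHYLINK_NETDEV are the real API shown in this patch):

struct foo_priv {
	struct net_device *ndev;
	struct phylink_config phylink_config;
	struct phylink *phylink;
};

static const struct phylink_mac_ops foo_phylink_mac_ops = {
	.validate = foo_validate,
	.mac_pcs_get_state = foo_mac_pcs_get_state,
	.mac_config = foo_mac_config,
	.mac_an_restart = foo_mac_an_restart,
	.mac_link_up = foo_mac_link_up,
	.mac_link_down = foo_mac_link_down,
};

static int foo_phylink_setup(struct foo_priv *priv, struct fwnode_handle *fwnode)
{
	struct phylink *pl;

	priv->phylink_config.dev = &priv->ndev->dev;
	priv->phylink_config.type = PHYLINK_NETDEV;

	/* The final argument is the member renamed to 'mac_ops' here. */
	pl = phylink_create(&priv->phylink_config, fwnode,
			    PHY_INTERFACE_MODE_SGMII, &foo_phylink_mac_ops);
	if (IS_ERR(pl))
		return PTR_ERR(pl);

	priv->phylink = pl;
	return 0;
}

Only the parameter name changes in this patch; the callbacks themselves are untouched.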
@@ -679,7 +679,7 @@ static int phylink_register_sfp(struct phylink *pl, struct phylink *phylink_create(struct phylink_config *config, struct fwnode_handle *fwnode, phy_interface_t iface, - const struct phylink_mac_ops *ops) + const struct phylink_mac_ops *mac_ops) { struct phylink *pl; int ret; @@ -712,7 +712,7 @@ struct phylink *phylink_create(struct phylink_config *config, pl->link_config.speed = SPEED_UNKNOWN; pl->link_config.duplex = DUPLEX_UNKNOWN; pl->link_config.an_enabled = true; - pl->ops = ops; + pl->mac_ops = mac_ops; __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); timer_setup(&pl->link_poll, phylink_fixed_poll, 0); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 6f6ecf3e0be1..90c907eaae15 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -272,7 +272,7 @@ void mac_link_up(struct phylink_config *config, struct phy_device *phy, struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, - const struct phylink_mac_ops *ops); + const struct phylink_mac_ops *mac_ops); void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); -- cgit v1.2.3 From 4c0d6d3a7a81fcd2dcb4abf15fe2e13074cf8619 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 30 Mar 2020 18:44:55 +0100 Subject: net: phylink: add separate pcs operations structure Add a separate set of PCS operations, which MAC drivers can use to couple phylink with their associated MAC PCS layer. The PCS operations include: - pcs_get_state() - reads the link up/down, resolved speed, duplex and pause from the PCS. - pcs_config() - configures the PCS for the specified mode, PHY interface type, and setting the advertisement. - pcs_an_restart() - restarts 802.3 in-band negotiation with the link partner - pcs_link_up() - informs the PCS that link has come up, and the parameters of the link. Link parameters are used to program the PCS for fixed speed and non-inband modes. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/phylink.c | 77 +++++++++++++++++++++++++++++------------ include/linux/phylink.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index e2f30fd4d235..34ca12aec61b 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -41,6 +41,7 @@ struct phylink { /* private: */ struct net_device *netdev; const struct phylink_mac_ops *mac_ops; + const struct phylink_pcs_ops *pcs_ops; struct phylink_config *config; struct device *dev; unsigned int old_link_state:1; @@ -425,11 +426,32 @@ static void phylink_mac_config_up(struct phylink *pl, phylink_mac_config(pl, state); } -static void phylink_mac_an_restart(struct phylink *pl) +static void phylink_mac_pcs_an_restart(struct phylink *pl) { if (pl->link_config.an_enabled && - phy_interface_mode_is_8023z(pl->link_config.interface)) - pl->mac_ops->mac_an_restart(pl->config); + phy_interface_mode_is_8023z(pl->link_config.interface)) { + if (pl->pcs_ops) + pl->pcs_ops->pcs_an_restart(pl->config); + else + pl->mac_ops->mac_an_restart(pl->config); + } +} + +static void phylink_pcs_config(struct phylink *pl, bool force_restart, + const struct phylink_link_state *state) +{ + bool restart = force_restart; + + if (pl->pcs_ops && pl->pcs_ops->pcs_config(pl->config, + pl->cur_link_an_mode, + state->interface, + state->advertising)) + restart = true; + + phylink_mac_config(pl, state); + + if (restart) + phylink_mac_pcs_an_restart(pl); } static void phylink_mac_pcs_get_state(struct phylink *pl, @@ -445,7 +467,10 @@ static void phylink_mac_pcs_get_state(struct phylink *pl, state->an_complete = 0; state->link = 1; - pl->mac_ops->mac_pcs_get_state(pl->config, state); + if (pl->pcs_ops) + pl->pcs_ops->pcs_get_state(pl->config, state); + else + pl->mac_ops->mac_pcs_get_state(pl->config, state); } /* The fixed state is... 
fixed except for the link state, @@ -463,7 +488,7 @@ static void phylink_get_fixed_state(struct phylink *pl, phylink_resolve_flow(state); } -static void phylink_mac_initial_config(struct phylink *pl) +static void phylink_mac_initial_config(struct phylink *pl, bool force_restart) { struct phylink_link_state link_state; @@ -489,7 +514,7 @@ static void phylink_mac_initial_config(struct phylink *pl) link_state.link = false; phylink_apply_manual_flow(pl, &link_state); - phylink_mac_config(pl, &link_state); + phylink_pcs_config(pl, force_restart, &link_state); } static const char *phylink_pause_to_str(int pause) @@ -506,12 +531,18 @@ static const char *phylink_pause_to_str(int pause) } } -static void phylink_mac_link_up(struct phylink *pl, - struct phylink_link_state link_state) +static void phylink_link_up(struct phylink *pl, + struct phylink_link_state link_state) { struct net_device *ndev = pl->netdev; pl->cur_interface = link_state.interface; + + if (pl->pcs_ops && pl->pcs_ops->pcs_link_up) + pl->pcs_ops->pcs_link_up(pl->config, pl->cur_link_an_mode, + pl->cur_interface, + link_state.speed, link_state.duplex); + pl->mac_ops->mac_link_up(pl->config, pl->phydev, pl->cur_link_an_mode, pl->cur_interface, link_state.speed, link_state.duplex, @@ -528,7 +559,7 @@ static void phylink_mac_link_up(struct phylink *pl, phylink_pause_to_str(link_state.pause)); } -static void phylink_mac_link_down(struct phylink *pl) +static void phylink_link_down(struct phylink *pl) { struct net_device *ndev = pl->netdev; @@ -597,9 +628,9 @@ static void phylink_resolve(struct work_struct *w) if (link_changed) { pl->old_link_state = link_state.link; if (!link_state.link) - phylink_mac_link_down(pl); + phylink_link_down(pl); else - phylink_mac_link_up(pl, link_state); + phylink_link_up(pl, link_state); } if (!link_state.link && pl->mac_link_dropped) { pl->mac_link_dropped = false; @@ -746,6 +777,12 @@ struct phylink *phylink_create(struct phylink_config *config, } EXPORT_SYMBOL_GPL(phylink_create); +void phylink_add_pcs(struct phylink *pl, const struct phylink_pcs_ops *ops) +{ + pl->pcs_ops = ops; +} +EXPORT_SYMBOL_GPL(phylink_add_pcs); + /** * phylink_destroy() - cleanup and destroy the phylink instance * @pl: a pointer to a &struct phylink returned from phylink_create() @@ -1082,14 +1119,12 @@ void phylink_start(struct phylink *pl) /* Apply the link configuration to the MAC when starting. This allows * a fixed-link to start with the correct parameters, and also * ensures that we set the appropriate advertisement for Serdes links. - */ - phylink_mac_initial_config(pl); - - /* Restart autonegotiation if using 802.3z to ensure that the link + * + * Restart autonegotiation if using 802.3z to ensure that the link * parameters are properly negotiated. This is necessary for DSA * switches using 802.3z negotiation to ensure they see our modes. */ - phylink_mac_an_restart(pl); + phylink_mac_initial_config(pl, true); clear_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); phylink_run_resolve(pl); @@ -1386,8 +1421,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, * advertisement; the only thing we have is the pause * modes which can only come from a PHY. 
*/ - phylink_mac_config(pl, &pl->link_config); - phylink_mac_an_restart(pl); + phylink_pcs_config(pl, true, &pl->link_config); } mutex_unlock(&pl->state_mutex); } @@ -1415,7 +1449,7 @@ int phylink_ethtool_nway_reset(struct phylink *pl) if (pl->phydev) ret = phy_restart_aneg(pl->phydev); - phylink_mac_an_restart(pl); + phylink_mac_pcs_an_restart(pl); return ret; } @@ -1494,8 +1528,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, pause->tx_pause); } else if (!test_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state)) { - phylink_mac_config(pl, &pl->link_config); - phylink_mac_an_restart(pl); + phylink_pcs_config(pl, true, &pl->link_config); } mutex_unlock(&pl->state_mutex); @@ -1901,7 +1934,7 @@ static int phylink_sfp_config(struct phylink *pl, u8 mode, if (changed && !test_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state)) - phylink_mac_initial_config(pl); + phylink_mac_initial_config(pl, false); return ret; } diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 90c907eaae15..3f8d37ec5503 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -270,9 +270,97 @@ void mac_link_up(struct phylink_config *config, struct phy_device *phy, int speed, int duplex, bool tx_pause, bool rx_pause); #endif +/** + * struct phylink_pcs_ops - MAC PCS operations structure. + * @pcs_get_state: read the current MAC PCS link state from the hardware. + * @pcs_config: configure the MAC PCS for the selected mode and state. + * @pcs_an_restart: restart 802.3z BaseX autonegotiation. + * @pcs_link_up: program the PCS for the resolved link configuration + * (where necessary). + */ +struct phylink_pcs_ops { + void (*pcs_get_state)(struct phylink_config *config, + struct phylink_link_state *state); + int (*pcs_config)(struct phylink_config *config, unsigned int mode, + phy_interface_t interface, + const unsigned long *advertising); + void (*pcs_an_restart)(struct phylink_config *config); + void (*pcs_link_up)(struct phylink_config *config, unsigned int mode, + phy_interface_t interface, int speed, int duplex); +}; + +#if 0 /* For kernel-doc purposes only. */ +/** + * pcs_get_state() - Read the current inband link state from the hardware + * @config: a pointer to a &struct phylink_config. + * @state: a pointer to a &struct phylink_link_state. + * + * Read the current inband link state from the MAC PCS, reporting the + * current speed in @state->speed, duplex mode in @state->duplex, pause + * mode in @state->pause using the %MLO_PAUSE_RX and %MLO_PAUSE_TX bits, + * negotiation completion state in @state->an_complete, and link up state + * in @state->link. If possible, @state->lp_advertising should also be + * populated. + * + * When present, this overrides mac_pcs_get_state() in &struct + * phylink_mac_ops. + */ +void pcs_get_state(struct phylink_config *config, + struct phylink_link_state *state); + +/** + * pcs_config() - Configure the PCS mode and advertisement + * @config: a pointer to a &struct phylink_config. + * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. + * @interface: interface mode to be used + * @advertising: advertisement ethtool link mode mask + * + * Configure the PCS for the operating mode, the interface mode, and set + * the advertisement mask. + * + * When operating in %MLO_AN_INBAND, inband should always be enabled, + * otherwise inband should be disabled. + * + * For SGMII, there is no advertisement from the MAC side, the PCS should + * be programmed to acknowledge the inband word from the PHY.
+ * + * For 1000BASE-X, the advertisement should be programmed into the PCS. + * + * For most 10GBASE-R, there is no advertisement. + */ +int (*pcs_config)(struct phylink_config *config, unsigned int mode, + phy_interface_t interface, const unsigned long *advertising); + +/** + * pcs_an_restart() - restart 802.3z BaseX autonegotiation + * @config: a pointer to a &struct phylink_config. + * + * When PCS ops are present, this overrides mac_an_restart() in &struct + * phylink_mac_ops. + */ +void (*pcs_an_restart)(struct phylink_config *config); + +/** + * pcs_link_up() - program the PCS for the resolved link configuration + * @config: a pointer to a &struct phylink_config. + * @mode: link autonegotiation mode + * @interface: link &typedef phy_interface_t mode + * @speed: link speed + * @duplex: link duplex + * + * This call will be made just before mac_link_up() to inform the PCS of + * the resolved link parameters. For example, a PCS operating in SGMII + * mode without in-band AN needs to be manually configured for the link + * and duplex setting. Otherwise, this should be a no-op. + */ +void (*pcs_link_up)(struct phylink_config *config, unsigned int mode, + phy_interface_t interface, int speed, int duplex); +#endif + struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, const struct phylink_mac_ops *mac_ops); +void phylink_add_pcs(struct phylink *, const struct phylink_pcs_ops *ops); void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); -- cgit v1.2.3 From 3f50f132d8400e129fc9eb68b5020167ef80a244 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 30 Mar 2020 14:36:39 -0700 Subject: bpf: Verifier, do explicit ALU32 bounds tracking It is not possible for the current verifier to track ALU32 and JMP ops correctly. This can result in the verifier aborting with errors even though the program should be verifiable. BPF programs that hit this can work around it by changing int variables to 64-bit types, marking variables volatile, etc. But this is all very ugly so it would be better to avoid these tricks. But, the main reason to address this now is that do_refine_retval_range() was assuming return values could not be negative. Once we fix this, code that was previously working will no longer work. See the do_refine_retval_range() patch for details. And we don't want to suddenly cause programs that used to work to fail. The simplest example code snippet that illustrates the problem is likely this,

 53: w8 = w0    // r8 <- [0, S32_MAX],
                // w8 <- [-S32_MIN, X]
 54: w8 ...

To fix this, 32-bit and 64-bit bounds are tracked independently and then merged; the cases to consider are,

 1. MOV ALU32 - zext 32-bit -> 64-bit
 2. MOV ALU64 - copy 64-bit -> 32-bit
 3. op  ALU32 - zext 32-bit -> 64-bit
 4. op  ALU64 - n/a
 5. jmp ALU32 - 64-bit: var32_off | upper_32_bits(var64_off)
 6. jmp ALU64 - 32-bit: (>> (<< var64_off))

Details for each case, For "MOV ALU32" BPF arch zero extends so we simply copy the bounds from 32-bit into 64-bit, ensuring we truncate var_off and 64-bit bounds correctly. See zext_32_to_64. For "MOV ALU64" copy all bounds including 32-bit into the new register. If the src register had 32-bit bounds the dst register will as well. For "op ALU32" zero extend 32-bit into 64-bit the same as move, see zext_32_to_64. For "op ALU64" calculate both 32-bit and 64-bit bounds; no merging is done here. Except we have a special case: when RSH or ARSH is done we can't simply ignore shifting bits from the 64-bit reg into the 32-bit subreg. So currently we just push bounds from 64-bit into 32-bit.
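To see why shifts are the awkward case, here is a standalone userspace illustration of a bit crossing the subregister boundary (plain C arithmetic, not verifier code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r = 0x100000002ULL;	/* lower 32 bits (w) == 2 */

	/* A 64-bit right shift can move a bit from the upper half into
	 * the low 32 bits, so the new subregister value cannot be
	 * computed from the old 32-bit value alone.
	 */
	printf("(u32)(r >> 1) = %#x\n", (uint32_t)(r >> 1));	/* 0x80000001 */
	printf("((u32)r) >> 1 = %#x\n", ((uint32_t)r) >> 1);	/* 0x1 */
	return 0;
}

Because shifted-in bits can change the subregister arbitrarily, the pushed-down 64-bit bounds are the best that can be claimed for the 32-bit side here.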
This will be correct in the sense that they will represent a valid state of the register. However, we could lose some accuracy if an ARSH is following a jmp32 operation. We can handle this special case in a follow-up series. For "jmp ALU32" mark 64-bit reg unknown and recalculate 64-bit bounds from tnum by setting var_off to ((<<(>>var_off)) | var32_off). We special case if 64-bit bounds have zero'd upper 32bits at which point we can simply copy 32-bit bounds into 64-bit register. This catches a common compiler trick where upper 32-bits are zeroed and then 32-bit ops are used followed by a 64-bit compare or 64-bit op on a pointer. See __reg_combine_64_into_32(). For "jmp ALU64" cast the bounds of the 64bit to their 32-bit counterpart. For example s32_min_value = (s32)reg->smin_value. For tnum use only the lower 32bits via (>>(<<var_off)). Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158560419880.10843.11448220440809118343.stgit@john-Precision-5820-Tower --- include/linux/bpf_verifier.h | 4 + include/linux/limits.h | 1 + include/linux/tnum.h | 12 + kernel/bpf/tnum.c | 15 + kernel/bpf/verifier.c | 1118 +++++++++++++++++++++++++++++++----------- 5 files changed, 869 insertions(+), 281 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5406e6e96585..6abd5a778fcd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -123,6 +123,10 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + s32 s32_min_value; /* minimum possible (s32)value */ + s32 s32_max_value; /* maximum possible (s32)value */ + u32 u32_min_value; /* minimum possible (u32)value */ + u32 u32_max_value; /* maximum possible (u32)value */ /* parentage chain for liveness checking */ struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like diff --git a/include/linux/limits.h b/include/linux/limits.h index 76afcd24ff8c..0d3de82dd354 100644 --- a/include/linux/limits.h +++ b/include/linux/limits.h @@ -27,6 +27,7 @@ #define S16_MAX ((s16)(U16_MAX >> 1)) #define S16_MIN ((s16)(-S16_MAX - 1)) #define U32_MAX ((u32)~0U) +#define U32_MIN ((u32)0) #define S32_MAX ((s32)(U32_MAX >> 1)) #define S32_MIN ((s32)(-S32_MAX - 1)) #define U64_MAX ((u64)~0ULL) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index ea627d1ab7e3..498dbcedb451 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -86,4 +86,16 @@ int tnum_strn(char *str, size_t size, struct tnum a); /* Format a tnum as tristate binary expansion */ int tnum_sbin(char *str, size_t size, struct tnum a); +/* Returns the 32-bit subreg */ +struct tnum tnum_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg cleared */ +struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg set to value */ +struct tnum tnum_const_subreg(struct tnum a, u32 value); +/* Returns true if 32-bit subreg @a is a known constant*/ +static inline bool tnum_subreg_is_const(struct tnum a) +{ + return !(tnum_subreg(a)).mask; +} + #endif /* _LINUX_TNUM_H */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d4f335a9a899..ceac5281bd31 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -194,3 +194,18 @@ int tnum_sbin(char *str, size_t size, struct tnum a) str[min(size - 1, (size_t)64)] = 0; return 64; } + +struct tnum tnum_subreg(struct tnum a) +{ + return tnum_cast(a, 4); +} +
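As a quick illustration of what these subreg helpers compute (tnum_subreg above, tnum_clear_subreg just below), here is a userspace sketch in which struct tnum is reduced to its value/mask pair and tnum_lshift/tnum_rshift are re-implemented to match the kernel's shift semantics; everything here is for illustration only:

#include <stdint.h>
#include <stdio.h>

struct tnum { uint64_t value; uint64_t mask; };	/* mask bit set => bit unknown */

static struct tnum tnum_lshift(struct tnum a, uint8_t shift)
{
	return (struct tnum){ a.value << shift, a.mask << shift };
}

static struct tnum tnum_rshift(struct tnum a, uint8_t shift)
{
	return (struct tnum){ a.value >> shift, a.mask >> shift };
}

static struct tnum tnum_clear_subreg(struct tnum a)
{
	/* Same shape as the patch: drop the low 32 bits, keep the rest. */
	return tnum_lshift(tnum_rshift(a, 32), 32);
}

int main(void)
{
	/* Upper half fully known, low byte unknown. */
	struct tnum a = { 0xdead00000000ff00ULL, 0xffULL };
	struct tnum hi = tnum_clear_subreg(a);

	printf("value=%llx mask=%llx\n",
	       (unsigned long long)hi.value, (unsigned long long)hi.mask);
	/* Prints value=dead000000000000 mask=0: the 32-bit subreg is cleared. */
	return 0;
}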
+struct tnum tnum_clear_subreg(struct tnum a) +{ + return tnum_lshift(tnum_rshift(a, 32), 32); +} + +struct tnum tnum_const_subreg(struct tnum a, u32 value) +{ + return tnum_or(tnum_clear_subreg(a), tnum_const(value)); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dda3b94d9661..1c60d001bb46 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -550,6 +550,22 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, ",var_off=%s", tn_buf); } + if (reg->s32_min_value != reg->smin_value && + reg->s32_min_value != S32_MIN) + verbose(env, ",s32_min_value=%d", + (int)(reg->s32_min_value)); + if (reg->s32_max_value != reg->smax_value && + reg->s32_max_value != S32_MAX) + verbose(env, ",s32_max_value=%d", + (int)(reg->s32_max_value)); + if (reg->u32_min_value != reg->umin_value && + reg->u32_min_value != U32_MIN) + verbose(env, ",u32_min_value=%d", + (int)(reg->u32_min_value)); + if (reg->u32_max_value != reg->umax_value && + reg->u32_max_value != U32_MAX) + verbose(env, ",u32_max_value=%d", + (int)(reg->u32_max_value)); } verbose(env, ")"); } @@ -924,6 +940,20 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->smax_value = (s64)imm; reg->umin_value = imm; reg->umax_value = imm; + + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; +} + +static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) +{ + reg->var_off = tnum_const_subreg(reg->var_off, imm); + reg->s32_min_value = (s32)imm; + reg->s32_max_value = (s32)imm; + reg->u32_min_value = (u32)imm; + reg->u32_max_value = (u32)imm; } /* Mark the 'variable offset' part of a register as zero. This should be @@ -978,8 +1008,52 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } -/* Attempts to improve min/max values based on var_off information */ -static void __update_reg_bounds(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; + + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __mark_reg64_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; +} + +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +{ + reg->s32_min_value = S32_MIN; + reg->s32_max_value = S32_MAX; + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + struct tnum var32_off = tnum_subreg(reg->var_off); + + /* min signed is max(sign bit) | min(other bits) */ + reg->s32_min_value = max_t(s32, reg->s32_min_value, + var32_off.value | (var32_off.mask & S32_MIN)); + /* max signed is min(sign bit) | max(other bits) */ + reg->s32_max_value = min_t(s32, reg->s32_max_value, + var32_off.value | (var32_off.mask & S32_MAX)); + reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); + reg->u32_max_value = min(reg->u32_max_value, + (u32)(var32_off.value | var32_off.mask)); +} + +static void __update_reg64_bounds(struct bpf_reg_state *reg) { /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, @@ -992,8 
+1066,48 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) reg->var_off.value | reg->var_off.mask); } +static void __update_reg_bounds(struct bpf_reg_state *reg) +{ + __update_reg32_bounds(reg); + __update_reg64_bounds(reg); +} + /* Uses signed min/max values to inform unsigned, and vice-versa */ -static void __reg_deduce_bounds(struct bpf_reg_state *reg) +static void __reg32_deduce_bounds(struct bpf_reg_state *reg) +{ + /* Learn sign from signed bounds. + * If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + */ + if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) { + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + return; + } + /* Learn sign from unsigned bounds. Signed bounds cross the sign + * boundary, so we must be careful. + */ + if ((s32)reg->u32_max_value >= 0) { + /* Positive. We can't learn anything from the smin, but smax + * is positive, hence safe. + */ + reg->s32_min_value = reg->u32_min_value; + reg->s32_max_value = reg->u32_max_value = + min_t(u32, reg->s32_max_value, reg->u32_max_value); + } else if ((s32)reg->u32_min_value < 0) { + /* Negative. We can't learn anything from the smax, but smin + * is negative, hence safe. + */ + reg->s32_min_value = reg->u32_min_value = + max_t(u32, reg->s32_min_value, reg->u32_min_value); + reg->s32_max_value = reg->u32_max_value; + } +} + +static void __reg64_deduce_bounds(struct bpf_reg_state *reg) { /* Learn sign from signed bounds. * If we cannot cross the sign boundary, then signed and unsigned bounds @@ -1027,21 +1141,106 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) } } +static void __reg_deduce_bounds(struct bpf_reg_state *reg) +{ + __reg32_deduce_bounds(reg); + __reg64_deduce_bounds(reg); +} + /* Attempts to improve var_off based on unsigned min/max information */ static void __reg_bound_offset(struct bpf_reg_state *reg) { - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + struct tnum var64_off = tnum_intersect(reg->var_off, + tnum_range(reg->umin_value, + reg->umax_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); + + reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + reg->umin_value = reg->u32_min_value; + reg->umax_value = reg->u32_max_value; + /* Attempt to pull 32-bit signed bounds into 64-bit bounds + * but must be positive otherwise set to worse case bounds + * and refine later from tnum. + */ + if (reg->s32_min_value > 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; + if (reg->s32_max_value > 0) + reg->smax_value = reg->s32_max_value; + else + reg->smax_value = U32_MAX; +} + +static void __reg_combine_32_into_64(struct bpf_reg_state *reg) +{ + /* special case when 64-bit register has upper 32-bit register + * zeroed. 
Typically happens after zext or <<32, >>32 sequence + * allowing us to use 32-bit bounds directly, + */ + if (tnum_equals_const(tnum_clear_subreg(reg->var_off), 0)) { + __reg_assign_32_into_64(reg); + } else { + /* Otherwise the best we can do is push lower 32bit known and + * unknown bits into register (var_off set from jmp logic) + * then learn as much as possible from the 64-bit tnum + * known and unknown bits. The previous smin/smax bounds are + * invalid here because of jmp32 compare so mark them unknown + * so they do not impact tnum bounds calculation. + */ + __mark_reg64_unbounded(reg); + __update_reg_bounds(reg); + } + + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); +} + +static bool __reg64_bound_s32(s64 a) +{ + if (a > S32_MIN && a < S32_MAX) + return true; + return false; +} + +static bool __reg64_bound_u32(u64 a) +{ + if (a > U32_MIN && a < U32_MAX) + return true; + return false; +} + +static void __reg_combine_64_into_32(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + + if (__reg64_bound_s32(reg->smin_value)) + reg->s32_min_value = (s32)reg->smin_value; + if (__reg64_bound_s32(reg->smax_value)) + reg->s32_max_value = (s32)reg->smax_value; + if (__reg64_bound_u32(reg->umin_value)) + reg->u32_min_value = (u32)reg->umin_value; + if (__reg64_bound_u32(reg->umax_value)) + reg->u32_max_value = (u32)reg->umax_value; + + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __reg_deduce_bounds(reg); + __reg_bound_offset(reg); + __update_reg_bounds(reg); } /* Mark a register as having a completely unknown (scalar) value. */ @@ -2774,6 +2973,12 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, return 0; } +/* BPF architecture zero extends alu32 ops into 64-bit registers */ +static void zext_32_to_64(struct bpf_reg_state *reg) +{ + reg->var_off = tnum_subreg(reg->var_off); + __reg_assign_32_into_64(reg); +} /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE @@ -2796,6 +3001,14 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) } reg->smin_value = reg->umin_value; reg->smax_value = reg->umax_value; + + /* If size is smaller than 32bit register the 32bit register + * values are also truncated so we push 64-bit bounds into + * 32-bit bounds. Above were truncated < 32-bits already.
+ */ + if (size >= 4) + return; + __reg_combine_64_into_32(reg); } static bool bpf_map_is_rdonly(const struct bpf_map *map) @@ -4431,7 +4644,17 @@ static bool signed_add_overflows(s64 a, s64 b) return res < a; } -static bool signed_sub_overflows(s64 a, s64 b) +static bool signed_add32_overflows(s64 a, s64 b) +{ + /* Do the add in u32, where overflow is well-defined */ + s32 res = (s32)((u32)a + (u32)b); + + if (b < 0) + return res > a; + return res < a; +} + +static bool signed_sub_overflows(s32 a, s32 b) { /* Do the sub in u64, where overflow is well-defined */ s64 res = (s64)((u64)a - (u64)b); @@ -4441,6 +4664,16 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool signed_sub32_overflows(s32 a, s32 b) +{ + /* Do the sub in u64, where overflow is well-defined */ + s32 res = (s32)((u32)a - (u32)b); + + if (b < 0) + return res < a; + return res > a; +} + static bool check_reg_sane_offset(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) @@ -4677,6 +4910,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) return -EINVAL; + /* pointer types do not carry 32-bit bounds at the moment. */ + __mark_reg32_unbounded(dst_reg); + switch (opcode) { case BPF_ADD: ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); @@ -4840,6 +5076,32 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return 0; } +static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) || + signed_add32_overflows(dst_reg->s32_max_value, smax_val)) { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value += smin_val; + dst_reg->s32_max_value += smax_val; + } + if (dst_reg->u32_min_value + umin_val < umin_val || + dst_reg->u32_max_value + umax_val < umax_val) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value += umin_val; + dst_reg->u32_max_value += umax_val; + } +} + static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -4864,7 +5126,34 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, dst_reg->umin_value += umin_val; dst_reg->umax_value += umax_val; } - dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg->var_off); +} + +static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + s32 smax_val = src_reg->s32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) || + signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value -= smax_val; + dst_reg->s32_max_value -= smin_val; + } + if (dst_reg->u32_min_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->u32_min_value -= umax_val; + dst_reg->u32_max_value -= umin_val; + } } static void 
scalar_min_max_sub(struct bpf_reg_state *dst_reg, @@ -4893,7 +5182,38 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, dst_reg->umin_value -= umax_val; dst_reg->umax_value -= umin_val; } - dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg->var_off); +} + +static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 smin_val = src_reg->s32_min_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + + if (smin_val < 0 || dst_reg->s32_min_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg32_unbounded(dst_reg); + return; + } + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S32_MAX). + */ + if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg32_unbounded(dst_reg); + return; + } + dst_reg->u32_min_value *= umin_val; + dst_reg->u32_max_value *= umax_val; + if (dst_reg->u32_max_value > S32_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, @@ -4903,11 +5223,9 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; - dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg->var_off); if (smin_val < 0 || dst_reg->smin_value < 0) { /* Ain't nobody got time to multiply that sign */ - __mark_reg_unbounded(dst_reg); - __update_reg_bounds(dst_reg); + __mark_reg64_unbounded(dst_reg); return; } /* Both values are positive, so we can work with unsigned and @@ -4915,9 +5233,7 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, */ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { /* Potential overflow, we know nothing */ - __mark_reg_unbounded(dst_reg); - /* (except what we can learn from the var_off) */ - __update_reg_bounds(dst_reg); + __mark_reg64_unbounded(dst_reg); return; } dst_reg->umin_value *= umin_val; @@ -4932,16 +5248,59 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + u32 umax_val = src_reg->u32_max_value; + + /* Assuming scalar64_min_max_and will be called so its safe + * to skip updating register for known 32-bit case. + */ + if (src_known && dst_known) + return; + + /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. + */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. 
+ */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } + +} + static void scalar_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); s64 smin_val = src_reg->smin_value; u64 umax_val = src_reg->umax_value; + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value & + src_reg->var_off.value); + return; + } + /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg->var_off); dst_reg->umin_value = dst_reg->var_off.value; dst_reg->umax_value = min(dst_reg->umax_value, umax_val); if (dst_reg->smin_value < 0 || smin_val < 0) { @@ -4961,16 +5320,58 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->smin_value; + u32 umin_val = src_reg->umin_value; + + /* Assuming scalar64_min_max_or will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima + */ + dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + if (dst_reg->s32_min_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } else { + /* ORing two positives gives a positive, so safe to + * cast result into s64. 
+ */ + dst_reg->s32_min_value = dst_reg->umin_value; + dst_reg->s32_max_value = dst_reg->umax_value; + } +} + static void scalar_min_max_or(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); s64 smin_val = src_reg->smin_value; u64 umin_val = src_reg->umin_value; + if (src_known && dst_known) { + __mark_reg_known(dst_reg, dst_reg->var_off.value | + src_reg->var_off.value); + return; + } + /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg->var_off); dst_reg->umin_value = max(dst_reg->umin_value, umin_val); dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; if (dst_reg->smin_value < 0 || smin_val < 0) { @@ -4990,17 +5391,62 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } -static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, - struct bpf_reg_state *src_reg) +static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; - /* We lose all sign bit information (except what we can pick * up from var_off) */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + /* If we might shift our top bit out, then we know nothing */ + if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + } else { + dst_reg->u32_min_value <<= umin_val; + dst_reg->u32_max_value <<= umax_val; + } +} + +static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + /* u32 alu operation will zext upper bits */ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + dst_reg->var_off = tnum_subreg(tnum_lshift(subreg, umin_val)); + /* Not required but being careful mark reg64 bounds as unknown so + * that we are forced to pick them up from tnum and zext later and + * if some path skips this step we are still safe. + */ + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, + u64 umin_val, u64 umax_val) +{ + /* Special case <<32 because it is a common compiler pattern to sign + * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are + * positive we know this shift will also be positive so we can track + * bounds correctly. Otherwise we lose all sign bit information except + * what we can pick up from var_off. Perhaps we can generalize this + * later to shifts of any length. 
+ */ + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; + else + dst_reg->smax_value = S64_MAX; + + if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) + dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; + else + dst_reg->smin_value = S64_MIN; + /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { dst_reg->umin_value = 0; @@ -5009,11 +5455,55 @@ static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } +} + +static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umax_val = src_reg->umax_value; + u64 umin_val = src_reg->umin_value; + + /* scalar64 calc uses 32bit unshifted bounds so must be called first */ + __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); + __scalar32_min_max_lsh(dst_reg, umin_val, umax_val); + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } +static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + struct tnum subreg = tnum_subreg(dst_reg->var_off); + u32 umax_val = src_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounds capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + + dst_reg->var_off = tnum_rshift(subreg, umin_val); + dst_reg->u32_min_value >>= umax_val; + dst_reg->u32_max_value >>= umin_val; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -5039,35 +5529,62 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); dst_reg->umin_value >>= umax_val; dst_reg->umax_value >>= umin_val; - /* We may learn something more from the var_off */ + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in. Take easy way out and mark unbounded + * so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); __update_reg_bounds(dst_reg); } -static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, - struct bpf_reg_state *src_reg, - u64 insn_bitness) +static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->umin_value; + u64 umin_val = src_reg->u32_min_value; /* Upon reaching here, src_known is true and * umax_val is equal to umin_val.
*/ - if (insn_bitness == 32) { - dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); - dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); - } else { - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - } + dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); + dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, - insn_bitness); + dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + + __mark_reg64_unbounded(dst_reg); + __update_reg32_bounds(dst_reg); +} + +static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 umin_val = src_reg->umin_value; + + /* Upon reaching here, src_known is true and umax_val is equal + * to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. */ dst_reg->umin_value = 0; dst_reg->umax_value = U64_MAX; + + /* Its not easy to operate on alu32 bounds here because it depends + * on bits being shifted in from upper 32-bits. Take easy way out + * and mark unbounded so we can recalculate later from tnum. + */ + __mark_reg32_unbounded(dst_reg); __update_reg_bounds(dst_reg); } @@ -5085,33 +5602,47 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + s32 s32_min_val, s32_max_val; + u32 u32_min_val, u32_max_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; u32 dst = insn->dst_reg; int ret; - - if (insn_bitness == 32) { - /* Relevant for 32-bit RSH: Information can propagate towards - * LSB, so it isn't sufficient to only truncate the output to - * 32 bits. - */ - coerce_reg_to_size(dst_reg, 4); - coerce_reg_to_size(&src_reg, 4); - } + bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; umax_val = src_reg.umax_value; - src_known = tnum_is_const(src_reg.var_off); - dst_known = tnum_is_const(dst_reg->var_off); - if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || - smin_val > smax_val || umin_val > umax_val) { - /* Taint dst register if offset had invalid bounds derived from - * e.g. dead branches. - */ - __mark_reg_unknown(env, dst_reg); - return 0; + s32_min_val = src_reg.s32_min_value; + s32_max_val = src_reg.s32_max_value; + u32_min_val = src_reg.u32_min_value; + u32_max_val = src_reg.u32_max_value; + + if (alu32) { + src_known = tnum_subreg_is_const(src_reg.var_off); + dst_known = tnum_subreg_is_const(dst_reg->var_off); + if ((src_known && + (s32_min_val != s32_max_val || u32_min_val != u32_max_val)) || + s32_min_val > s32_max_val || u32_min_val > u32_max_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. 
+ */ + __mark_reg_unknown(env, dst_reg); + return 0; + } + } else { + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && + (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds + * derived from e.g. dead branches. + */ + __mark_reg_unknown(env, dst_reg); + return 0; + } } if (!src_known && @@ -5120,6 +5651,20 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, return 0; } + /* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops. + * There are two classes of instructions: for the first class we track + * both alu32 and alu64 sign/unsigned bounds independently; this provides + * the greatest amount of precision when alu operations are mixed with + * jmp32 operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, + * BPF_AND, and BPF_OR. This is possible because these ops have fairly + * easy to understand and calculate behavior in both 32-bit and 64-bit + * alu ops. See alu32 verifier tests for examples. The second class of + * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however, is not so easy + * with regard to tracking sign/unsigned bounds, because the bits may + * cross subreg boundaries in the alu64 case. When this happens we mark + * the reg unbounded in the subreg bound space and use the resulting + * tnum to calculate an approximation of the sign/unsigned bounds. + */ switch (opcode) { case BPF_ADD: ret = sanitize_val_alu(env, insn); @@ -5127,7 +5672,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to add from different pointers or scalars\n", dst); return ret; } + scalar32_min_max_add(dst_reg, &src_reg); scalar_min_max_add(dst_reg, &src_reg); + dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: ret = sanitize_val_alu(env, insn); @@ -5135,25 +5682,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); return ret; } + scalar32_min_max_sub(dst_reg, &src_reg); scalar_min_max_sub(dst_reg, &src_reg); + dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: + dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg.var_off.value); - break; - } + dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: - if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg.var_off.value); - break; - } + dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); break; case BPF_LSH: @@ -5164,7 +5709,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - scalar_min_max_lsh(dst_reg, &src_reg); + if (alu32) + scalar32_min_max_lsh(dst_reg, &src_reg); + else + scalar_min_max_lsh(dst_reg, &src_reg); break; case BPF_RSH: if (umax_val >= insn_bitness) { @@ -5174,7 +5722,10 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - scalar_min_max_rsh(dst_reg,
&src_reg); + if (alu32) + scalar32_min_max_rsh(dst_reg, &src_reg); + else + scalar_min_max_rsh(dst_reg, &src_reg); break; case BPF_ARSH: if (umax_val >= insn_bitness) { @@ -5184,17 +5735,19 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, mark_reg_unknown(env, regs, insn->dst_reg); break; } - scalar_min_max_arsh(dst_reg, &src_reg, insn_bitness); + if (alu32) + scalar32_min_max_arsh(dst_reg, &src_reg); + else + scalar_min_max_arsh(dst_reg, &src_reg); break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; } - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->32 */ - coerce_reg_to_size(dst_reg, 4); - } + /* ALU32 ops are zero extended into 64bit register */ + if (alu32) + zext_32_to_64(dst_reg); __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); @@ -5370,7 +5923,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_unknown(env, regs, insn->dst_reg); } - coerce_reg_to_size(dst_reg, 4); + zext_32_to_64(dst_reg); } } else { /* case: R = imm @@ -5540,55 +6093,83 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range); } -/* compute branch direction of the expression "if (reg opcode val) goto target;" - * and return: - * 1 - branch will be taken and "goto target" will be executed - * 0 - branch will not be taken and fall-through to next insn - * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] - */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, - bool is_jmp32) +static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) { - struct bpf_reg_state reg_lo; - s64 sval; + struct tnum subreg = tnum_subreg(reg->var_off); + s32 sval = (s32)val; - if (__is_pointer_value(false, reg)) - return -1; + switch (opcode) { + case BPF_JEQ: + if (tnum_is_const(subreg)) + return !!tnum_equals_const(subreg, val); + break; + case BPF_JNE: + if (tnum_is_const(subreg)) + return !tnum_equals_const(subreg, val); + break; + case BPF_JSET: + if ((~subreg.mask & subreg.value) & val) + return 1; + if (!((subreg.mask | subreg.value) & val)) + return 0; + break; + case BPF_JGT: + if (reg->u32_min_value > val) + return 1; + else if (reg->u32_max_value <= val) + return 0; + break; + case BPF_JSGT: + if (reg->s32_min_value > sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLT: + if (reg->u32_max_value < val) + return 1; + else if (reg->u32_min_value >= val) + return 0; + break; + case BPF_JSLT: + if (reg->s32_max_value < sval) + return 1; + else if (reg->s32_min_value >= sval) + return 0; + break; + case BPF_JGE: + if (reg->u32_min_value >= val) + return 1; + else if (reg->u32_max_value < val) + return 0; + break; + case BPF_JSGE: + if (reg->s32_min_value >= sval) + return 1; + else if (reg->s32_max_value < sval) + return 0; + break; + case BPF_JLE: + if (reg->u32_max_value <= val) + return 1; + else if (reg->u32_min_value > val) + return 0; + break; + case BPF_JSLE: + if (reg->s32_max_value <= sval) + return 1; + else if (reg->s32_min_value > sval) + return 0; + break; + } - if (is_jmp32) { - reg_lo = *reg; - reg = ®_lo; - /* For JMP32, only low 32 bits are compared, coerce_reg_to_size - * could truncate high bits and update umin/umax according to - * information of low bits. - */ - coerce_reg_to_size(reg, 4); - /* smin/smax need special handling. For example, after coerce, - * if smin_value is 0x00000000ffffffffLL, the value is -1 when - * used as operand to JMP32. 
It is a negative number from s32's - * point of view, while it is a positive number when seen as - * s64. The smin/smax are kept as s64, therefore, when used with - * JMP32, they need to be transformed into s32, then sign - * extended back to s64. - * - * Also, smin/smax were copied from umin/umax. If umin/umax has - * different sign bit, then min/max relationship doesn't - * maintain after casting into s32, for this case, set smin/smax - * to safest range. - */ - if ((reg->umax_value ^ reg->umin_value) & - (1ULL << 31)) { - reg->smin_value = S32_MIN; - reg->smax_value = S32_MAX; - } - reg->smin_value = (s64)(s32)reg->smin_value; - reg->smax_value = (s64)(s32)reg->smax_value; + return -1; +} - val = (u32)val; - sval = (s64)(s32)val; - } else { - sval = (s64)val; - } + +static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ + s64 sval = (s64)val; switch (opcode) { case BPF_JEQ: @@ -5658,91 +6239,22 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return -1; } -/* Generate min value of the high 32-bit from TNUM info. */ -static u64 gen_hi_min(struct tnum var) -{ - return var.value & ~0xffffffffULL; -} - -/* Generate max value of the high 32-bit from TNUM info. */ -static u64 gen_hi_max(struct tnum var) -{ - return (var.value | var.mask) & ~0xffffffffULL; -} - -/* Return true if VAL is compared with a s64 sign extended from s32, and they - * are with the same signedness. - */ -static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) -{ - return ((s32)sval >= 0 && - reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || - ((s32)sval < 0 && - reg->smax_value <= 0 && reg->smin_value >= S32_MIN); -} - -/* Constrain the possible values of @reg with unsigned upper bound @bound. - * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. - * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits - * of @reg. - */ -static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, - bool is_exclusive) -{ - if (is_exclusive) { - /* There are no values for `reg` that make `reg<0` true. */ - if (bound == 0) - return; - bound--; - } - if (is_jmp32) { - /* Constrain the register's value in the tnum representation. - * For 64-bit comparisons this happens later in - * __reg_bound_offset(), but for 32-bit comparisons, we can be - * more precise than what can be derived from the updated - * numeric bounds. - */ - struct tnum t = tnum_range(0, bound); - - t.mask |= ~0xffffffffULL; /* upper half is unknown */ - reg->var_off = tnum_intersect(reg->var_off, t); - - /* Compute the 64-bit bound from the 32-bit bound. */ - bound += gen_hi_max(reg->var_off); - } - reg->umax_value = min(reg->umax_value, bound); -} - -/* Constrain the possible values of @reg with unsigned lower bound @bound. - * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. - * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits - * of @reg. +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + * 1 - branch will be taken and "goto target" will be executed + * 0 - branch will not be taken and fall-through to next insn + * -1 - unknown. 
Example: "if (reg < 5)" is unknown when register value + * range [0,10] */ -static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, - bool is_exclusive) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { - if (is_exclusive) { - /* There are no values for `reg` that make `reg>MAX` true. */ - if (bound == (is_jmp32 ? U32_MAX : U64_MAX)) - return; - bound++; - } - if (is_jmp32) { - /* Constrain the register's value in the tnum representation. - * For 64-bit comparisons this happens later in - * __reg_bound_offset(), but for 32-bit comparisons, we can be - * more precise than what can be derived from the updated - * numeric bounds. - */ - struct tnum t = tnum_range(bound, U32_MAX); - - t.mask |= ~0xffffffffULL; /* upper half is unknown */ - reg->var_off = tnum_intersect(reg->var_off, t); + if (__is_pointer_value(false, reg)) + return -1; - /* Compute the 64-bit bound from the 32-bit bound. */ - bound += gen_hi_min(reg->var_off); - } - reg->umin_value = max(reg->umin_value, bound); + if (is_jmp32) + return is_branch32_taken(reg, val, opcode); + return is_branch64_taken(reg, val, opcode); } /* Adjusts the register min/max values in the case that the dst_reg is the @@ -5751,10 +6263,16 @@ static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, * In JEQ/JNE cases we also adjust the var_off values. */ static void reg_set_min_max(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { - s64 sval; + struct tnum false_32off = tnum_subreg(false_reg->var_off); + struct tnum false_64off = false_reg->var_off; + struct tnum true_32off = tnum_subreg(true_reg->var_off); + struct tnum true_64off = true_reg->var_off; + s64 sval = (s64)val; + s32 sval32 = (s32)val32; /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into @@ -5765,9 +6283,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; - val = is_jmp32 ? (u32)val : val; - sval = is_jmp32 ? (s64)(s32)val : (s64)val; - switch (opcode) { case BPF_JEQ: case BPF_JNE: @@ -5779,87 +6294,126 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, * if it is true we know the value for sure. Likewise for * BPF_JNE. */ - if (is_jmp32) { - u64 old_v = reg->var_off.value; - u64 hi_mask = ~0xffffffffULL; - - reg->var_off.value = (old_v & hi_mask) | val; - reg->var_off.mask &= hi_mask; - } else { + if (is_jmp32) + __mark_reg32_known(reg, val32); + else __mark_reg_known(reg, val); - } break; } case BPF_JSET: - false_reg->var_off = tnum_and(false_reg->var_off, - tnum_const(~val)); - if (is_power_of_2(val)) - true_reg->var_off = tnum_or(true_reg->var_off, - tnum_const(val)); + if (is_jmp32) { + false_32off = tnum_and(false_32off, tnum_const(~val32)); + if (is_power_of_2(val32)) + true_32off = tnum_or(true_32off, + tnum_const(val32)); + } else { + false_64off = tnum_and(false_64off, tnum_const(~val)); + if (is_power_of_2(val)) + true_64off = tnum_or(true_64off, + tnum_const(val)); + } break; case BPF_JGE: case BPF_JGT: { - set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); - set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); + if (is_jmp32) { + u32 false_umax = opcode == BPF_JGT ? val32 : val32 - 1; + u32 true_umin = opcode == BPF_JGT ? 
val32 + 1 : val32; + + false_reg->u32_max_value = min(false_reg->u32_max_value, + false_umax); + true_reg->u32_min_value = max(true_reg->u32_min_value, + true_umin); + } else { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); + } break; } case BPF_JSGE: case BPF_JSGT: { - s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; - s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + if (is_jmp32) { + s32 false_smax = opcode == BPF_JSGT ? sval32 : sval32 - 1; + s32 true_smin = opcode == BPF_JSGT ? sval32 + 1 : sval32; - /* If the full s64 was not sign-extended from s32 then don't - * deduct further info. - */ - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smax_value = min(false_reg->smax_value, false_smax); - true_reg->smin_value = max(true_reg->smin_value, true_smin); + false_reg->s32_max_value = min(false_reg->s32_max_value, false_smax); + true_reg->s32_min_value = max(true_reg->s32_min_value, true_smin); + } else { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); + } break; } case BPF_JLE: case BPF_JLT: { - set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); - set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); + if (is_jmp32) { + u32 false_umin = opcode == BPF_JLT ? val32 : val32 + 1; + u32 true_umax = opcode == BPF_JLT ? val32 - 1 : val32; + + false_reg->u32_min_value = max(false_reg->u32_min_value, + false_umin); + true_reg->u32_max_value = min(true_reg->u32_max_value, + true_umax); + } else { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); + } break; } case BPF_JSLE: case BPF_JSLT: { - s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; - s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + if (is_jmp32) { + s32 false_smin = opcode == BPF_JSLT ? sval32 : sval32 + 1; + s32 true_smax = opcode == BPF_JSLT ? sval32 - 1 : sval32; - if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) - break; - false_reg->smin_value = max(false_reg->smin_value, false_smin); - true_reg->smax_value = min(true_reg->smax_value, true_smax); + false_reg->s32_min_value = max(false_reg->s32_min_value, false_smin); + true_reg->s32_max_value = min(true_reg->s32_max_value, true_smax); + } else { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); + } break; } default: return; } - __reg_deduce_bounds(false_reg); - __reg_deduce_bounds(true_reg); - /* We might have learned some bits from the bounds. */ - __reg_bound_offset(false_reg); - __reg_bound_offset(true_reg); - /* Intersecting with the old var_off might have improved our bounds - * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), - * then new var_off is (0; 0x7f...fc) which improves our umax. 
- */ - __update_reg_bounds(false_reg); - __update_reg_bounds(true_reg); + if (is_jmp32) { + false_reg->var_off = tnum_or(tnum_clear_subreg(false_64off), + tnum_subreg(false_32off)); + true_reg->var_off = tnum_or(tnum_clear_subreg(true_64off), + tnum_subreg(true_32off)); + __reg_combine_32_into_64(false_reg); + __reg_combine_32_into_64(true_reg); + } else { + false_reg->var_off = false_64off; + true_reg->var_off = true_64off; + __reg_combine_64_into_32(false_reg); + __reg_combine_64_into_32(true_reg); + } } /* Same as above, but for the case that dst_reg holds a constant and src_reg is * the variable reg. */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, - struct bpf_reg_state *false_reg, u64 val, + struct bpf_reg_state *false_reg, + u64 val, u32 val32, u8 opcode, bool is_jmp32) { /* How can we transform "a b" into "b a"? */ @@ -5883,7 +6437,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, * BPF_JA, can't get here. */ if (opcode) - reg_set_min_max(true_reg, false_reg, val, opcode, is_jmp32); + reg_set_min_max(true_reg, false_reg, val, val32, opcode, is_jmp32); } /* Regs are known to be equal, so intersect their min/max/var_off */ @@ -6172,13 +6726,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - if (BPF_SRC(insn->code) == BPF_K) - pred = is_branch_taken(dst_reg, insn->imm, - opcode, is_jmp32); - else if (src_reg->type == SCALAR_VALUE && - tnum_is_const(src_reg->var_off)) - pred = is_branch_taken(dst_reg, src_reg->var_off.value, - opcode, is_jmp32); + if (BPF_SRC(insn->code) == BPF_K) { + pred = is_branch_taken(dst_reg, insn->imm, opcode, is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(src_reg->var_off))) { + pred = is_branch_taken(dst_reg, + tnum_subreg(src_reg->var_off).value, + opcode, + is_jmp32); + } else if (src_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(src_reg->var_off)) { + pred = is_branch_taken(dst_reg, + src_reg->var_off.value, + opcode, + is_jmp32); + } + if (pred >= 0) { err = mark_chain_precision(env, insn->dst_reg); if (BPF_SRC(insn->code) == BPF_X && !err) @@ -6212,32 +6775,24 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (BPF_SRC(insn->code) == BPF_X) { struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - struct bpf_reg_state lo_reg0 = *dst_reg; - struct bpf_reg_state lo_reg1 = *src_reg; - struct bpf_reg_state *src_lo, *dst_lo; - - dst_lo = &lo_reg0; - src_lo = &lo_reg1; - coerce_reg_to_size(dst_lo, 4); - coerce_reg_to_size(src_lo, 4); if (dst_reg->type == SCALAR_VALUE && src_reg->type == SCALAR_VALUE) { if (tnum_is_const(src_reg->var_off) || - (is_jmp32 && tnum_is_const(src_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(src_reg->var_off)))) reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, - is_jmp32 - ? src_lo->var_off.value - : src_reg->var_off.value, + src_reg->var_off.value, + tnum_subreg(src_reg->var_off).value, opcode, is_jmp32); else if (tnum_is_const(dst_reg->var_off) || - (is_jmp32 && tnum_is_const(dst_lo->var_off))) + (is_jmp32 && + tnum_is_const(tnum_subreg(dst_reg->var_off)))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], src_reg, - is_jmp32 - ? 
dst_lo->var_off.value - : dst_reg->var_off.value, + dst_reg->var_off.value, + tnum_subreg(dst_reg->var_off).value, opcode, is_jmp32); else if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE)) @@ -6248,7 +6803,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode, is_jmp32); + dst_reg, insn->imm, (u32)insn->imm, + opcode, is_jmp32); } /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
-- cgit v1.2.3
From af6eea57437a830293eab56246b6025cc7d46ee7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Sun, 29 Mar 2020 19:59:58 -0700
Subject: bpf: Implement bpf_link-based cgroup BPF program attachment

Implement a new sub-command to attach cgroup BPF programs and return an FD-based bpf_link back on success. A bpf_link, once attached to a cgroup, cannot be replaced, except by an owner having its FD. Cgroup bpf_link supports only BPF_F_ALLOW_MULTI semantics. Both link-based and prog-based BPF_F_ALLOW_MULTI attachments can be freely intermixed.

To prevent bpf_cgroup_link from keeping a cgroup alive past the point when no BPF program can be executed, implement auto-detachment of the link. When cgroup_bpf_release() is called, all attached bpf_links are forced to release their cgroup refcounts, but they leave the bpf_link otherwise active and allocated, as well as still owning the underlying bpf_prog. This is because user-space might still have FDs open and active, so bpf_link as a user-referenced object can't be freed yet. Once the last active FD is closed, the bpf_link is freed and the underlying bpf_prog refcount is dropped. But the cgroup refcount won't be touched, because the cgroup has already been released.

The inherent race between bpf_cgroup_link release (from closing the last FD) and cgroup_bpf_release() is resolved by both operations taking cgroup_mutex. So the only additional check required is when a bpf_cgroup_link attempts to detach itself from its cgroup. At that time we need to check whether there is still a cgroup associated with that link, and if not, exit with success, because the bpf_cgroup_link was already successfully detached.
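To make the intended usage concrete, here is a minimal userspace sketch (illustrative only, not part of this patch). It drives the new BPF_LINK_CREATE command through the raw bpf(2) syscall, assuming a uapi header that already carries this change; prog_fd and cgroup_fd are placeholder descriptors:

    /* Sketch: attach a cgroup BPF program via BPF_LINK_CREATE.
     * Assumes prog_fd refers to an already-loaded cgroup program
     * (e.g. BPF_PROG_TYPE_CGROUP_SKB) and cgroup_fd is an open
     * cgroup v2 directory; both FDs are illustrative.
     */
    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int bpf_link_create(int prog_fd, int cgroup_fd, int attach_type)
    {
    	union bpf_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.link_create.prog_fd = prog_fd;         /* eBPF program to attach */
    	attr.link_create.target_fd = cgroup_fd;     /* cgroup to attach to */
    	attr.link_create.attach_type = attach_type; /* e.g. BPF_CGROUP_INET_INGRESS */
    	attr.link_create.flags = 0;                 /* no extra flags defined yet */

    	/* on success, returns a new link FD owning the attachment */
    	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
    }

Closing the returned link FD is what ultimately detaches the program, matching the ownership model described above.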
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20200330030001.2312810-2-andriin@fb.com --- include/linux/bpf-cgroup.h | 29 +++- include/linux/bpf.h | 10 +- include/uapi/linux/bpf.h | 10 +- kernel/bpf/cgroup.c | 315 +++++++++++++++++++++++++++++++---------- kernel/bpf/syscall.c | 64 +++++++-- kernel/cgroup/cgroup.c | 14 +- tools/include/uapi/linux/bpf.h | 10 +- 7 files changed, 354 insertions(+), 98 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7cd5c7a2509..d2d969669564 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -51,9 +51,18 @@ struct bpf_cgroup_storage { struct rcu_head rcu; }; +struct bpf_cgroup_link { + struct bpf_link link; + struct cgroup *cgroup; + enum bpf_attach_type type; +}; + +extern const struct bpf_link_ops bpf_cgroup_link_lops; + struct bpf_prog_list { struct list_head node; struct bpf_prog *prog; + struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; @@ -84,20 +93,23 @@ struct cgroup_bpf { int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags); + enum bpf_attach_type type); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -332,6 +344,7 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else @@ -354,6 +367,12 @@ static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, return -EINVAL; } +static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EINVAL; +} + static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3bde59a8453b..56254d880293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1082,15 +1082,23 @@ extern int sysctl_unprivileged_bpf_disabled; int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); -struct bpf_link; +struct bpf_link { + atomic64_t refcnt; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_link_ops { void (*release)(struct 
bpf_link *link); void (*dealloc)(struct bpf_link *link); + }; void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog); +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f786a5a44ac..37dffe5089a0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9c8472823a7f..c24029937431 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -80,6 +80,17 @@ static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) bpf_cgroup_storage_unlink(storages[stype]); } +/* Called when bpf_cgroup_link is auto-detached from dying cgroup. + * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It + * doesn't free link memory, which will eventually be done by bpf_link's + * release() callback, when its last FD is closed. + */ +static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) +{ + cgroup_put(link->cgroup); + link->cgroup = NULL; +} + /** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data @@ -100,7 +111,10 @@ static void cgroup_bpf_release(struct work_struct *work) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); - bpf_prog_put(pl->prog); + if (pl->prog) + bpf_prog_put(pl->prog); + if (pl->link) + bpf_cgroup_link_auto_detach(pl->link); bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_free(pl->storage); kfree(pl); @@ -134,6 +148,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) queue_work(system_wq, &cgrp->bpf.release_work); } +/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through + * link or direct prog. + */ +static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) +{ + if (pl->prog) + return pl->prog; + if (pl->link) + return pl->link->link.prog; + return NULL; +} + /* count number of elements in the list. 
* it's slow but the list cannot be long */ @@ -143,7 +169,7 @@ static u32 prog_list_length(struct list_head *head) u32 cnt = 0; list_for_each_entry(pl, head, node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; cnt++; } @@ -212,11 +238,11 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; list_for_each_entry(pl, &p->bpf.progs[type], node) { - if (!pl->prog) + if (!prog_list_prog(pl)) continue; item = &progs->items[cnt]; - item->prog = pl->prog; + item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; @@ -333,19 +359,60 @@ cleanup: #define BPF_CGROUP_MAX_PROGS 64 +static struct bpf_prog_list *find_attach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + struct bpf_prog *replace_prog, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + /* single-attach case */ + if (!allow_multi) { + if (list_empty(progs)) + return NULL; + return list_first_entry(progs, typeof(*pl), node); + } + + list_for_each_entry(pl, progs, node) { + if (prog && pl->prog == prog) + /* disallow attaching the same prog twice */ + return ERR_PTR(-EINVAL); + if (link && pl->link == link) + /* disallow attaching the same link twice */ + return ERR_PTR(-EINVAL); + } + + /* direct prog multi-attach w/ replacement case */ + if (replace_prog) { + list_for_each_entry(pl, progs, node) { + if (pl->prog == replace_prog) + /* a match found */ + return pl; + } + /* prog to replace not found for cgroup */ + return ERR_PTR(-ENOENT); + } + + return NULL; +} + /** - * __cgroup_bpf_attach() - Attach the program to a cgroup, and + * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach + * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * + * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. 
*/ -int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, +int __cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); @@ -353,13 +420,19 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; - struct bpf_prog_list *pl, *replace_pl = NULL; + struct bpf_prog_list *pl; int err; if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if (link && (prog || replace_prog)) + /* only either link or prog/replace_prog can be specified */ + return -EINVAL; + if (!!replace_prog != !!(flags & BPF_F_REPLACE)) + /* replace_prog implies BPF_F_REPLACE, and vice versa */ + return -EINVAL; if (!hierarchy_allows_attach(cgrp, type)) return -EPERM; @@ -374,26 +447,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - if (flags & BPF_F_ALLOW_MULTI) { - list_for_each_entry(pl, progs, node) { - if (pl->prog == prog) - /* disallow attaching the same prog twice */ - return -EINVAL; - if (pl->prog == replace_prog) - replace_pl = pl; - } - if ((flags & BPF_F_REPLACE) && !replace_pl) - /* prog to replace not found for cgroup */ - return -ENOENT; - } else if (!list_empty(progs)) { - replace_pl = list_first_entry(progs, typeof(*pl), node); - } + pl = find_attach_entry(progs, prog, link, replace_prog, + flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog)) + if (bpf_cgroup_storages_alloc(storage, prog ? 
: link->link.prog)) return -ENOMEM; - if (replace_pl) { - pl = replace_pl; + if (pl) { old_prog = pl->prog; bpf_cgroup_storages_unlink(pl->storage); bpf_cgroup_storages_assign(old_storage, pl->storage); @@ -407,6 +469,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } pl->prog = prog; + pl->link = link; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[type] = saved_flags; @@ -414,80 +477,93 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (err) goto cleanup; - static_branch_inc(&cgroup_bpf_enabled_key); bpf_cgroup_storages_free(old_storage); - if (old_prog) { + if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); - } - bpf_cgroup_storages_link(storage, cgrp, type); + else + static_branch_inc(&cgroup_bpf_enabled_key); + bpf_cgroup_storages_link(pl->storage, cgrp, type); return 0; cleanup: - /* and cleanup the prog list */ - pl->prog = old_prog; + if (old_prog) { + pl->prog = old_prog; + pl->link = NULL; + } bpf_cgroup_storages_free(pl->storage); bpf_cgroup_storages_assign(pl->storage, old_storage); bpf_cgroup_storages_link(pl->storage, cgrp, type); - if (!replace_pl) { + if (!old_prog) { list_del(&pl->node); kfree(pl); } return err; } +static struct bpf_prog_list *find_detach_entry(struct list_head *progs, + struct bpf_prog *prog, + struct bpf_cgroup_link *link, + bool allow_multi) +{ + struct bpf_prog_list *pl; + + if (!allow_multi) { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return ERR_PTR(-ENOENT); + + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) in legacy mode + */ + return list_first_entry(progs, typeof(*pl), node); + } + + if (!prog && !link) + /* to detach MULTI prog the user has to specify valid FD + * of the program or link to be detached + */ + return ERR_PTR(-EINVAL); + + /* find the prog or link and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog == prog && pl->link == link) + return pl; + } + return ERR_PTR(-ENOENT); +} + /** - * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL + * @link: A link to detach or NULL * @type: Type of detach operation * + * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held.
*/ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; - struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; + struct bpf_prog *old_prog; int err; - if (flags & BPF_F_ALLOW_MULTI) { - if (!prog) - /* to detach MULTI prog the user has to specify valid FD - * of the program to be detached - */ - return -EINVAL; - } else { - if (list_empty(progs)) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; - } + if (prog && link) + /* only one of prog or link can be specified */ + return -EINVAL; - if (flags & BPF_F_ALLOW_MULTI) { - /* find the prog and detach it */ - list_for_each_entry(pl, progs, node) { - if (pl->prog != prog) - continue; - old_prog = prog; - /* mark it deleted, so it's ignored while - * recomputing effective - */ - pl->prog = NULL; - break; - } - if (!old_prog) - return -ENOENT; - } else { - /* to maintain backward compatibility NONE and OVERRIDE cgroups - * allow detaching with invalid FD (prog==NULL) - */ - pl = list_first_entry(progs, typeof(*pl), node); - old_prog = pl->prog; - pl->prog = NULL; - } + pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + /* mark it deleted, so it's ignored while recomputing effective */ + old_prog = pl->prog; + pl->prog = NULL; + pl->link = NULL; err = update_effective_progs(cgrp, type); if (err) @@ -501,14 +577,15 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (list_empty(progs)) /* last program was detached, reset flags to zero */ cgrp->bpf.flags[type] = 0; - - bpf_prog_put(old_prog); + if (old_prog) + bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); return 0; cleanup: - /* and restore back old_prog */ + /* restore back prog or link */ pl->prog = old_prog; + pl->link = link; return err; } @@ -521,6 +598,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog_array *effective; + struct bpf_prog *prog; int cnt, ret = 0, i; effective = rcu_dereference_protected(cgrp->bpf.effective[type], @@ -551,7 +629,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, i = 0; list_for_each_entry(pl, progs, node) { - id = pl->prog->aux->id; + prog = prog_list_prog(pl); + id = prog->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) return -EFAULT; if (++i == cnt) @@ -581,8 +660,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } } - ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type, - attr->attach_flags); + ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, + attr->attach_type, attr->attach_flags); if (replace_prog) bpf_prog_put(replace_prog); @@ -604,7 +683,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); if (prog) bpf_prog_put(prog); @@ -612,6 +691,90 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) return ret; } +static void bpf_cgroup_link_release(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + /* link might have been auto-detached by dying cgroup already, 
+ * in that case our work is done here + */ + if (!cg_link->cgroup) + return; + + mutex_lock(&cgroup_mutex); + + /* re-check cgroup under lock again */ + if (!cg_link->cgroup) { + mutex_unlock(&cgroup_mutex); + return; + } + + WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, + cg_link->type)); + + mutex_unlock(&cgroup_mutex); + cgroup_put(cg_link->cgroup); +} + +static void bpf_cgroup_link_dealloc(struct bpf_link *link) +{ + struct bpf_cgroup_link *cg_link = + container_of(link, struct bpf_cgroup_link, link); + + kfree(cg_link); +} + +const struct bpf_link_ops bpf_cgroup_link_lops = { + .release = bpf_cgroup_link_release, + .dealloc = bpf_cgroup_link_dealloc, +}; + +int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_cgroup_link *link; + struct file *link_file; + struct cgroup *cgrp; + int err, link_fd; + + if (attr->link_create.flags) + return -EINVAL; + + cgrp = cgroup_get_from_fd(attr->link_create.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_cgroup; + } + bpf_link_init(&link->link, &bpf_cgroup_link_lops, prog); + link->cgroup = cgrp; + link->type = attr->link_create.attach_type; + + link_file = bpf_link_new_file(&link->link, &link_fd); + if (IS_ERR(link_file)) { + kfree(link); + err = PTR_ERR(link_file); + goto out_put_cgroup; + } + + err = cgroup_bpf_attach(cgrp, NULL, NULL, link, link->type, + BPF_F_ALLOW_MULTI); + if (err) { + bpf_link_cleanup(&link->link, link_file, link_fd); + goto out_put_cgroup; + } + + fd_install(link_fd, link_file); + return link_fd; + +out_put_cgroup: + cgroup_put(cgrp); + return err; +} + int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a616b63f23b4..97d5c6fb63cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2175,13 +2175,6 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } -struct bpf_link { - atomic64_t refcnt; - const struct bpf_link_ops *ops; - struct bpf_prog *prog; - struct work_struct work; -}; - void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, struct bpf_prog *prog) { @@ -2195,8 +2188,8 @@ void bpf_link_init(struct bpf_link *link, const struct bpf_link_ops *ops, * anon_inode's release() call. This helper manages marking bpf_link as * defunct, releases anon_inode file and puts reserved FD. 
*/ -static void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, - int link_fd) +void bpf_link_cleanup(struct bpf_link *link, struct file *link_file, + int link_fd) { link->prog = NULL; fput(link_file); @@ -2266,6 +2259,10 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) link_type = "raw_tracepoint"; else if (link->ops == &bpf_tracing_link_lops) link_type = "tracing"; +#ifdef CONFIG_CGROUP_BPF + else if (link->ops == &bpf_cgroup_link_lops) + link_type = "cgroup"; +#endif else link_type = "unknown"; @@ -3553,6 +3550,52 @@ err_put: return err; } +#define BPF_LINK_CREATE_LAST_FIELD link_create.flags +static int link_create(union bpf_attr *attr) +{ + enum bpf_prog_type ptype; + struct bpf_prog *prog; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_CREATE)) + return -EINVAL; + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + + prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_attach_check_attach_type(prog, + attr->link_create.attach_type); + if (ret) + goto err_out; + + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + ret = cgroup_bpf_link_attach(attr, prog); + break; + default: + ret = -EINVAL; + } + +err_out: + if (ret < 0) + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -3663,6 +3706,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH); break; + case BPF_LINK_CREATE: + err = link_create(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3dead0416b91..219624fba9ba 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6303,27 +6303,31 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_prog *replace_prog, enum bpf_attach_type type, +int cgroup_bpf_attach(struct cgroup *cgrp, + struct bpf_prog *prog, struct bpf_prog *replace_prog, + struct bpf_cgroup_link *link, + enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type) { int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); mutex_unlock(&cgroup_mutex); return ret; } + int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9f786a5a44ac..37dffe5089a0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { BPF_MAP_LOOKUP_AND_DELETE_BATCH, BPF_MAP_UPDATE_BATCH, 
BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, }; enum bpf_map_type { @@ -541,7 +542,7 @@ union bpf_attr { __u32 prog_cnt; } query; - struct { + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ __u64 name; __u32 prog_fd; } raw_tracepoint; @@ -569,6 +570,13 @@ union bpf_attr { __u64 probe_offset; /* output: probe_offset */ __u64 probe_addr; /* output: probe_addr */ } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + __u32 prog_fd; /* eBPF program to attach */ + __u32 target_fd; /* object to attach to */ + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + } link_create; } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF
-- cgit v1.2.3
From 0c991ebc8c69d29b7fc44db17075c5aa5253e2ab Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Sun, 29 Mar 2020 19:59:59 -0700
Subject: bpf: Implement bpf_prog replacement for an active bpf_cgroup_link

Add a new operation (LINK_UPDATE), which allows replacing the active bpf_prog under a given bpf_link. Currently this is only supported for bpf_cgroup_link, but it will be extended to other kinds of bpf_links in follow-up patches.

For bpf_cgroup_link, the implemented functionality matches the existing semantics for direct bpf_prog attachment (including the BPF_F_REPLACE flag). The user can either unconditionally set a new bpf_prog regardless of which bpf_prog is currently active under the given bpf_link, or, optionally, can specify the expected active bpf_prog. If the active bpf_prog doesn't match the expected one, no changes are performed, the old bpf_link stays intact and attached, and the operation returns a failure.

The cgroup_bpf_replace() operation resolves the race between auto-detachment and bpf_prog update in the same fashion as is done for bpf_link detachment, except that in this case the update has no way of succeeding, because the target cgroup is marked as dying, so an error is returned.
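As an illustration only (not part of this patch), a minimal userspace sketch of a conditional update, assuming a uapi header that already carries BPF_LINK_UPDATE and BPF_F_REPLACE; link_fd, new_prog_fd, and old_prog_fd are placeholder descriptors:

    /* Sketch: atomically swap the program under an existing link.
     * With BPF_F_REPLACE the kernel refuses the swap (-EPERM) unless
     * old_prog_fd still matches the currently attached program.
     */
    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int bpf_link_update(int link_fd, int new_prog_fd, int old_prog_fd)
    {
    	union bpf_attr attr;

    	memset(&attr, 0, sizeof(attr));
    	attr.link_update.link_fd = link_fd;
    	attr.link_update.new_prog_fd = new_prog_fd;
    	if (old_prog_fd >= 0) {
    		/* conditional variant: expected old program must match */
    		attr.link_update.flags = BPF_F_REPLACE;
    		attr.link_update.old_prog_fd = old_prog_fd;
    	}
    	return syscall(__NR_bpf, BPF_LINK_UPDATE, &attr, sizeof(attr));
    }

Passing old_prog_fd < 0 here selects the unconditional variant, where the kernel swaps in new_prog_fd regardless of what is currently attached.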
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200330030001.2312810-3-andriin@fb.com --- include/linux/bpf-cgroup.h | 12 +++++++ include/uapi/linux/bpf.h | 12 +++++++ kernel/bpf/cgroup.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 55 +++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 27 ++++++++++++++++ 5 files changed, 186 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d2d969669564..c11b413d5b1a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -100,6 +100,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type); +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, + struct bpf_prog *new_prog); int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -110,6 +112,8 @@ int cgroup_bpf_attach(struct cgroup *cgrp, u32 flags); int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, + struct bpf_prog *new_prog); int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -350,6 +354,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, #else struct bpf_prog; +struct bpf_link; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} @@ -373,6 +378,13 @@ static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, return -EINVAL; } +static inline int cgroup_bpf_replace(struct bpf_link *link, + struct bpf_prog *old_prog, + struct bpf_prog *new_prog) +{ + return -EINVAL; +} + static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 37dffe5089a0..2e29a671d67e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_cmd { BPF_MAP_UPDATE_BATCH, BPF_MAP_DELETE_BATCH, BPF_LINK_CREATE, + BPF_LINK_UPDATE, }; enum bpf_map_type { @@ -577,6 +578,17 @@ union bpf_attr { __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + /* new program fd to update link with */ + __u32 new_prog_fd; + __u32 flags; /* extra flags */ + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags */ + __u32 old_prog_fd; + } link_update; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c24029937431..80676fc00d81 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -500,6 +500,86 @@ cleanup: return err; } +/* Swap updated BPF program for given link in effective program arrays across + * all descendant cgroups. This function is guaranteed to succeed. 
+ */ +static void replace_effective_prog(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_cgroup_link *link) +{ + struct bpf_prog_array_item *item; + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[type]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + item = &progs->items[pos]; + WRITE_ONCE(item->prog, link->link.prog); + } +} + +/** + * __cgroup_bpf_replace() - Replace link's program and propagate the change + * to descendants + * @cgrp: The cgroup which descendants to traverse + * @link: A link for which to replace BPF program + * @new_prog: Updated program to replace the link's current program + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, + struct bpf_prog *new_prog) +{ + struct list_head *progs = &cgrp->bpf.progs[link->type]; + struct bpf_prog *old_prog; + struct bpf_prog_list *pl; + bool found = false; + + if (link->link.prog->type != new_prog->type) + return -EINVAL; + + list_for_each_entry(pl, progs, node) { + if (pl->link == link) { + found = true; + break; + } + } + if (!found) + return -ENOENT; + + old_prog = xchg(&link->link.prog, new_prog); + replace_effective_prog(cgrp, link->type, link); + bpf_prog_put(old_prog); + return 0; +} + static struct bpf_prog_list *find_detach_entry(struct list_head *progs, struct bpf_prog *prog, struct bpf_cgroup_link *link, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 97d5c6fb63cd..e0a3b34d7039 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3596,6 +3596,58 @@ err_out: return ret; } +#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd + +static int link_update(union bpf_attr *attr) +{ + struct bpf_prog *old_prog = NULL, *new_prog; + struct bpf_link *link; + u32 flags; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (CHECK_ATTR(BPF_LINK_UPDATE)) + return -EINVAL; + + flags = attr->link_update.flags; + if (flags & ~BPF_F_REPLACE) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_update.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); + + if (flags & BPF_F_REPLACE) { + old_prog = bpf_prog_get(attr->link_update.old_prog_fd); + if (IS_ERR(old_prog)) { + ret = PTR_ERR(old_prog); + old_prog = NULL; + goto out_put_progs; + } + } + +#ifdef CONFIG_CGROUP_BPF + if (link->ops == &bpf_cgroup_link_lops) { + ret = cgroup_bpf_replace(link, old_prog, new_prog); + goto out_put_progs; + } +#endif + ret = -EINVAL; + +out_put_progs: + if (old_prog) + bpf_prog_put(old_prog); + if (ret) + bpf_prog_put(new_prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned
int, siz case BPF_LINK_CREATE: err = link_create(&attr); break; + case BPF_LINK_UPDATE: + err = link_update(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 219624fba9ba..915dda3f7f19 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6317,6 +6317,33 @@ int cgroup_bpf_attach(struct cgroup *cgrp, return ret; } +int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *old_prog, + struct bpf_prog *new_prog) +{ + struct bpf_cgroup_link *cg_link; + int ret; + + if (link->ops != &bpf_cgroup_link_lops) + return -EINVAL; + + cg_link = container_of(link, struct bpf_cgroup_link, link); + + mutex_lock(&cgroup_mutex); + /* link might have been auto-released by dying cgroup, so fail */ + if (!cg_link->cgroup) { + ret = -EINVAL; + goto out_unlock; + } + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); +out_unlock: + mutex_unlock(&cgroup_mutex); + return ret; +} + int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type) { -- cgit v1.2.3
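Taken together, BPF_LINK_CREATE and BPF_LINK_UPDATE give a race-free attach/replace/detach lifecycle for cgroup programs. A sketch of the intended flow (illustrative only, with placeholder FDs, assuming a uapi header containing both patches):

    /* Sketch: attach v1 via a link, later swap in v2 without any window
     * where no program is attached, then detach by closing the link FD.
     */
    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long sys_bpf(int cmd, union bpf_attr *attr)
    {
    	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
    }

    static int rollout(int cgroup_fd, int prog_v1_fd, int prog_v2_fd)
    {
    	union bpf_attr attr;
    	long link_fd, err;

    	memset(&attr, 0, sizeof(attr));
    	attr.link_create.prog_fd = prog_v1_fd;
    	attr.link_create.target_fd = cgroup_fd;
    	attr.link_create.attach_type = BPF_CGROUP_INET_INGRESS;
    	link_fd = sys_bpf(BPF_LINK_CREATE, &attr);
    	if (link_fd < 0)
    		return (int)link_fd;

    	/* swap v1 -> v2 only if v1 is still the active program */
    	memset(&attr, 0, sizeof(attr));
    	attr.link_update.link_fd = (__u32)link_fd;
    	attr.link_update.new_prog_fd = prog_v2_fd;
    	attr.link_update.flags = BPF_F_REPLACE;
    	attr.link_update.old_prog_fd = prog_v1_fd;
    	err = sys_bpf(BPF_LINK_UPDATE, &attr);

    	close((int)link_fd); /* last FD closed: link is detached and freed */
    	return (int)err;
    }

If the cgroup died in the meantime, the link has already been auto-detached and the update fails with an error, as described in the commit message above.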